
[pygame] New strict MMX patch for transform.c



I am including a new patch for transform.c. It replaces the patches I submitted earlier. It cleans up the assembly code and omits the NO_SSE flag. Profiling shows that filter_shrink_X_MMX is still nearly 4X faster than filter_shrink_X_ONLYC. No extra tweaking by interleaving instructions was done, since that showed less than a 1% performance change (below the margin of error for the profile).
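For anyone reading the diff below: pshufw and pmulhuw are not part of the base MMX instruction set (they arrived with the SSE integer extensions), which is why the patch replaces the pshufw $0 word broadcast with a punpcklwd/punpckldq pair and rebuilds the unsigned high multiply out of the signed pmulhw plus two psraw/pand correction terms. The scalar C sketch below is illustrative only and is not part of the patch; it just checks the identities the new sequences rely on.

/* Scalar sketch of the identities behind the strict-MMX sequences.
 * Illustrative only; the function names are made up for this example.
 */
#include <stdint.h>
#include <stdio.h>

/* What pmulhuw computes per 16-bit word: the unsigned high product. */
static uint16_t mulhi_u16(uint16_t a, uint16_t b)
{
    return (uint16_t)(((uint32_t)a * (uint32_t)b) >> 16);
}

/* The same value built the base-MMX way used in the patch:
 *   pmulhw            -> signed high product
 *   psraw $15 + pand  -> add b where a has its top bit set,
 *                        add a where b has its top bit set
 *   paddw             -> 16-bit wraparound additions
 */
static uint16_t mulhi_u16_from_signed(uint16_t a, uint16_t b)
{
    uint32_t prod  = (uint32_t)((int32_t)(int16_t)a * (int32_t)(int16_t)b);
    uint16_t hi    = (uint16_t)(prod >> 16);        /* pmulhw result */
    uint16_t fix_a = (uint16_t)((a & 0x8000) ? b : 0);
    uint16_t fix_b = (uint16_t)((b & 0x8000) ? a : 0);
    return (uint16_t)(hi + fix_a + fix_b);          /* wraps like paddw */
}

/* What "pshufw $0, r, r" did, now done with punpcklwd r,r
 * (giving w0 w0 w1 w1) followed by punpckldq r,r (giving w0 w0 w0 w0).
 */
static uint64_t broadcast_word0(uint64_t r)
{
    uint64_t w0 = r & 0xFFFF;
    return w0 | (w0 << 16) | (w0 << 32) | (w0 << 48);
}

int main(void)
{
    static const uint16_t s[] =
        { 0, 1, 0x3FFF, 0x4000, 0x7FFF, 0x8000, 0xC000, 0xFFFF };
    size_t i, j, n = sizeof(s) / sizeof(s[0]);

    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++)
            if (mulhi_u16(s[i], s[j]) != mulhi_u16_from_signed(s[i], s[j]))
                printf("mismatch at %04x * %04x\n", s[i], s[j]);

    printf("broadcast(0x1234) = %016llx\n",
           (unsigned long long)broadcast_word0(0x1234));
    return 0;
}

The sample values include the sign-bit boundary cases, which is exactly where pmulhw and pmulhuw diverge and the psraw/pand corrections matter.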

--
Lenard Lindstrom
<len-l@xxxxxxxxx>

1018c1018
<     long long One64 = 0x4000400040004000;
---
>     long long One64 = 0x4000400040004000ULL;
1025,1026c1025,1026
<         " movq             %2,      %%mm6;           " /* mm6 = 2^14  */
<         " pshufw    $0, %%mm7,      %%mm7;           "
---
>         " punpcklwd     %%mm7,      %%mm7;           "
>         " punpckldq     %%mm7,      %%mm7;           "
1042,1043c1042,1044
<         " movq          %%mm6,      %%mm3;           " /* mm3 = 2^14  */
<         " pshufw    $0, %%mm2,      %%mm2;           "
---
>         " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
>         " punpcklwd     %%mm2,      %%mm2;           "
>         " punpckldq     %%mm2,      %%mm2;           "
1049,1050c1050,1067
<         " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
<         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
---
>         " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
>         " psraw           $15,      %%mm5;           "
>         " pand          %%mm2,      %%mm5;           "
>         " movq          %%mm2,      %%mm6;           "
>         " psraw           $15,      %%mm6;           "
>         " pand          %%mm4,      %%mm6;           "
>         " pmulhw        %%mm4,      %%mm2;           "
>         " paddw         %%mm5,      %%mm2;           "
>         " paddw         %%mm6,      %%mm2;           "
>         " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16 */
>         " psraw           $15,      %%mm5;           "
>         " pand          %%mm3,      %%mm5;           "
>         " movq          %%mm3,      %%mm6;           "
>         " psraw           $15,      %%mm6;           "
>         " pand          %%mm4,      %%mm6;           "
>         " pmulhw        %%mm4,      %%mm3;           "
>         " paddw         %%mm5,      %%mm3;           "
>         " paddw         %%mm6,      %%mm3;           "
1053c1070,1078
<         " pmulhuw       %%mm7,      %%mm2;           "
---
>         " movq          %%mm7,      %%mm5;           "
>         " psraw           $15,      %%mm5;           "
>         " pand          %%mm2,      %%mm5;           "
>         " movq          %%mm2,      %%mm6;           "
>         " psraw           $15,      %%mm6;           "
>         " pand          %%mm7,      %%mm6;           "
>         " pmulhw        %%mm7,      %%mm2;           "
>         " paddw         %%mm5,      %%mm2;           "
>         " paddw         %%mm6,      %%mm2;           "
1076,1077c1101,1102
<         " movq             %2,      %%mm6;           " /* mm6 = 2^14  */
<         " pshufw    $0, %%mm7,      %%mm7;           "
---
>         " punpcklwd     %%mm7,      %%mm7;           "
>         " punpckldq     %%mm7,      %%mm7;           "
1093,1094c1118,1120
<         " movq          %%mm6,      %%mm3;           " /* mm3 = 2^14  */
<         " pshufw    $0, %%mm2,      %%mm2;           "
---
>         " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
>         " punpcklwd     %%mm2,      %%mm2;           "
>         " punpckldq     %%mm2,      %%mm2;           "
1100,1101c1126,1143
<         " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
<         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
---
>         " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
>         " psraw           $15,      %%mm5;           "
>         " pand          %%mm2,      %%mm5;           "
>         " movq          %%mm2,      %%mm6;           "
>         " psraw           $15,      %%mm6;           "
>         " pand          %%mm4,      %%mm6;           "
>         " pmulhw        %%mm4,      %%mm2;           "
>         " paddw         %%mm5,      %%mm2;           "
>         " paddw         %%mm6,      %%mm2;           "
>         " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16 */
>         " psraw           $15,      %%mm5;           "
>         " pand          %%mm3,      %%mm5;           "
>         " movq          %%mm3,      %%mm6;           "
>         " psraw           $15,      %%mm6;           "
>         " pand          %%mm4,      %%mm6;           "
>         " pmulhw        %%mm4,      %%mm3;           "
>         " paddw         %%mm5,      %%mm3;           "
>         " paddw         %%mm6,      %%mm3;           "
1104c1146,1154
<         " pmulhuw       %%mm7,      %%mm2;           "
---
>         " movq          %%mm7,      %%mm5;           "
>         " psraw           $15,      %%mm5;           "
>         " pand          %%mm2,      %%mm5;           "
>         " movq          %%mm2,      %%mm6;           "
>         " psraw           $15,      %%mm6;           "
>         " pand          %%mm7,      %%mm6;           "
>         " pmulhw        %%mm7,      %%mm2;           "
>         " paddw         %%mm5,      %%mm2;           "
>         " paddw         %%mm6,      %%mm2;           "
1202c1252
<     long long One64 = 0x4000400040004000;
---
>     long long One64 = 0x4000400040004000ULL;
1210c1260,1261
<         " pshufw    $0, %%mm7,      %%mm7;           "
---
>         " punpcklwd     %%mm7,      %%mm7;           "
>         " punpckldq     %%mm7,      %%mm7;           "
1232c1283,1284
<         " pshufw    $0, %%mm1,      %%mm1;           "
---
>         " punpcklwd     %%mm1,      %%mm1;           "
>         " punpckldq     %%mm1,      %%mm1;           "
1241,1242c1293,1310
<         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
<         " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
---
>         " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16 */
>         " psraw           $15,      %%mm0;           "
>         " pand          %%mm3,      %%mm0;           "
>         " movq          %%mm3,      %%mm2;           "
>         " psraw           $15,      %%mm2;           "
>         " pand          %%mm4,      %%mm2;           "
>         " pmulhw        %%mm4,      %%mm3;           "
>         " paddw         %%mm0,      %%mm3;           "
>         " paddw         %%mm2,      %%mm3;           "
>         " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
>         " psraw           $15,      %%mm0;           "
>         " pand          %%mm4,      %%mm0;           "
>         " movq          %%mm4,      %%mm2;           "
>         " psraw           $15,      %%mm2;           "
>         " pand          %%mm1,      %%mm2;           "
>         " pmulhw        %%mm1,      %%mm4;           "
>         " paddw         %%mm0,      %%mm4;           "
>         " paddw         %%mm2,      %%mm4;           "
1246c1314,1323
<         " pmulhuw       %%mm7,      %%mm4;           "
---
>         " movq          %%mm7,      %%mm0;           "
>         " psraw           $15,      %%mm0;           "
>         " pand          %%mm4,      %%mm0;           "
>         " movq          %%mm4,      %%mm2;           "
>         " psraw           $15,      %%mm2;           "
>         " pand          %%mm7,      %%mm2;           "
>         " pmulhw        %%mm7,      %%mm4;           "
>         " paddw         %%mm0,      %%mm4;           "
>         " paddw         %%mm2,      %%mm4;           "
>         " pxor          %%mm0,      %%mm0;           "
1270c1347,1348
<         " pshufw    $0, %%mm7,      %%mm7;           "
---
>         " punpcklwd     %%mm7,      %%mm7;           "
>         " punpckldq     %%mm7,      %%mm7;           "
1292c1370,1371
<         " pshufw    $0, %%mm1,      %%mm1;           "
---
>         " punpcklwd     %%mm1,      %%mm1;           "
>         " punpckldq     %%mm1,      %%mm1;           "
1301,1302c1380,1397
<         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
<         " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
---
>         " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16 */
>         " psraw           $15,      %%mm0;           "
>         " pand          %%mm3,      %%mm0;           "
>         " movq          %%mm3,      %%mm2;           "
>         " psraw           $15,      %%mm2;           "
>         " pand          %%mm4,      %%mm2;           "
>         " pmulhw        %%mm4,      %%mm3;           "
>         " paddw         %%mm0,      %%mm3;           "
>         " paddw         %%mm2,      %%mm3;           "
>         " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
>         " psraw           $15,      %%mm0;           "
>         " pand          %%mm4,      %%mm0;           "
>         " movq          %%mm4,      %%mm2;           "
>         " psraw           $15,      %%mm2;           "
>         " pand          %%mm1,      %%mm2;           "
>         " pmulhw        %%mm1,      %%mm4;           "
>         " paddw         %%mm0,      %%mm4;           "
>         " paddw         %%mm2,      %%mm4;           "
1306c1401,1410
<         " pmulhuw       %%mm7,      %%mm4;           "
---
>         " movq          %%mm7,      %%mm0;           "
>         " psraw           $15,      %%mm0;           "
>         " pand          %%mm4,      %%mm0;           "
>         " movq          %%mm4,      %%mm2;           "
>         " psraw           $15,      %%mm2;           "
>         " pand          %%mm7,      %%mm2;           "
>         " pmulhw        %%mm7,      %%mm4;           "
>         " paddw         %%mm0,      %%mm4;           "
>         " paddw         %%mm2,      %%mm4;           "
>         " pxor          %%mm0,      %%mm0;           "
1535,1536c1639,1642
<              " pshufw      $0, %%mm1, %%mm1;                      "
<              " pshufw      $0, %%mm2, %%mm2;                      "
---
>              " punpcklwd  %%mm1,      %%mm1;                      "
>              " punpckldq  %%mm1,      %%mm1;                      "
>              " punpcklwd  %%mm2,      %%mm2;                      "
>              " punpckldq  %%mm2,      %%mm2;                      "
1564,1565c1670,1673
<              " pshufw      $0, %%mm1, %%mm1;                      "
<              " pshufw      $0, %%mm2, %%mm2;                      "
---
>              " punpcklwd  %%mm1,      %%mm1;                      "
>              " punpckldq  %%mm1,      %%mm1;                      "
>              " punpcklwd  %%mm2,      %%mm2;                      "
>              " punpckldq  %%mm2,      %%mm2;                      "