[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]

Re: [pygame] New strict MMX patch for transform.c



Awesome.  Thanks :)

Committed revision 1048.


On 8/26/07, Lenard Lindstrom <len-l@xxxxxxxxx> wrote:
> I am including a new patch for transform.c . It replaces the patches I
> submitted earlier. It cleans up the assembly code and omits the
> NO_SSE flag. Profiling shows that filter_shrink_X_MMX is still nearly 4X
> faster than filter_shrink_X_ONLYC. No extra tweaking, by interleaving
> instructions, was done since that showed a less than 1% performance
> change (below the margin of error for the profile).
>
> --
> Lenard Lindstrom
> <len-l@xxxxxxxxx>
>
>
> 1018c1018
> <     long long One64 = 0x4000400040004000;
> ---
> >     long long One64 = 0x4000400040004000ULL;
> 1025,1026c1025,1026
> <         " movq             %2,      %%mm6;           " /* mm6 = 2^14  */
> <         " pshufw    $0, %%mm7,      %%mm7;           "
> ---
> >         " punpcklwd     %%mm7,      %%mm7;           "
> >         " punpckldq     %%mm7,      %%mm7;           "
> 1042,1043c1042,1044
> <         " movq          %%mm6,      %%mm3;           " /* mm3 = 2^14  */
> <         " pshufw    $0, %%mm2,      %%mm2;           "
> ---
> >         " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
> >         " punpcklwd     %%mm2,      %%mm2;           "
> >         " punpckldq     %%mm2,      %%mm2;           "
> 1049,1050c1050,1067
> <         " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
> <         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
> ---
> >         " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
> >         " psraw           $15,      %%mm5;           "
> >         " pand          %%mm2,      %%mm5;           "
> >         " movq          %%mm2,      %%mm6;           "
> >         " psraw           $15,      %%mm6;           "
> >         " pand          %%mm4,      %%mm6;           "
> >         " pmulhw        %%mm4,      %%mm2;           "
> >         " paddw         %%mm5,      %%mm2;           "
> >         " paddw         %%mm6,      %%mm2;           "
> >         " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16) */
> >         " psraw           $15,      %%mm5;           "
> >         " pand          %%mm3,      %%mm5;           "
> >         " movq          %%mm3,      %%mm6;           "
> >         " psraw           $15,      %%mm6;           "
> >         " pand          %%mm4,      %%mm6;           "
> >         " pmulhw        %%mm4,      %%mm3;           "
> >         " paddw         %%mm5,      %%mm3;           "
> >         " paddw         %%mm6,      %%mm3;           "
> 1053c1070,1078
> <         " pmulhuw       %%mm7,      %%mm2;           "
> ---
> >         " movq          %%mm7,      %%mm5;           "
> >         " psraw           $15,      %%mm5;           "
> >         " pand          %%mm2,      %%mm5;           "
> >         " movq          %%mm2,      %%mm6;           "
> >         " psraw           $15,      %%mm6;           "
> >         " pand          %%mm7,      %%mm6;           "
> >         " pmulhw        %%mm7,      %%mm2;           "
> >         " paddw         %%mm5,      %%mm2;           "
> >         " paddw         %%mm6,      %%mm2;           "
> 1076,1077c1101,1102
> <         " movq             %2,      %%mm6;           " /* mm6 = 2^14  */
> <         " pshufw    $0, %%mm7,      %%mm7;           "
> ---
> >         " punpcklwd     %%mm7,      %%mm7;           "
> >         " punpckldq     %%mm7,      %%mm7;           "
> 1093,1094c1118,1120
> <         " movq          %%mm6,      %%mm3;           " /* mm3 = 2^14  */
> <         " pshufw    $0, %%mm2,      %%mm2;           "
> ---
> >         " movq             %2,      %%mm3;           " /* mm3 = 2^14  */
> >         " punpcklwd     %%mm2,      %%mm2;           "
> >         " punpckldq     %%mm2,      %%mm2;           "
> 1100,1101c1126,1143
> <         " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
> <         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
> ---
> >         " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
> >         " psraw           $15,      %%mm5;           "
> >         " pand          %%mm2,      %%mm5;           "
> >         " movq          %%mm2,      %%mm6;           "
> >         " psraw           $15,      %%mm6;           "
> >         " pand          %%mm4,      %%mm6;           "
> >         " pmulhw        %%mm4,      %%mm2;           "
> >         " paddw         %%mm5,      %%mm2;           "
> >         " paddw         %%mm6,      %%mm2;           "
> >         " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16) */
> >         " psraw           $15,      %%mm5;           "
> >         " pand          %%mm3,      %%mm5;           "
> >         " movq          %%mm3,      %%mm6;           "
> >         " psraw           $15,      %%mm6;           "
> >         " pand          %%mm4,      %%mm6;           "
> >         " pmulhw        %%mm4,      %%mm3;           "
> >         " paddw         %%mm5,      %%mm3;           "
> >         " paddw         %%mm6,      %%mm3;           "
> 1104c1146,1154
> <         " pmulhuw       %%mm7,      %%mm2;           "
> ---
> >         " movq          %%mm7,      %%mm5;           "
> >         " psraw           $15,      %%mm5;           "
> >         " pand          %%mm2,      %%mm5;           "
> >         " movq          %%mm2,      %%mm6;           "
> >         " psraw           $15,      %%mm6;           "
> >         " pand          %%mm7,      %%mm6;           "
> >         " pmulhw        %%mm7,      %%mm2;           "
> >         " paddw         %%mm5,      %%mm2;           "
> >         " paddw         %%mm6,      %%mm2;           "
> 1202c1252
> <     long long One64 = 0x4000400040004000;
> ---
> >     long long One64 = 0x4000400040004000ULL;
> 1210c1260,1261
> <         " pshufw    $0, %%mm7,      %%mm7;           "
> ---
> >         " punpcklwd     %%mm7,      %%mm7;           "
> >         " punpckldq     %%mm7,      %%mm7;           "
> 1232c1283,1284
> <         " pshufw    $0, %%mm1,      %%mm1;           "
> ---
> >         " punpcklwd     %%mm1,      %%mm1;           "
> >         " punpckldq     %%mm1,      %%mm1;           "
> 1241,1242c1293,1310
> <         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
> <         " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
> ---
> >         " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16) */
> >         " psraw           $15,      %%mm0;           "
> >         " pand          %%mm3,      %%mm0;           "
> >         " movq          %%mm3,      %%mm2;           "
> >         " psraw           $15,      %%mm2;           "
> >         " pand          %%mm4,      %%mm2;           "
> >         " pmulhw        %%mm4,      %%mm3;           "
> >         " paddw         %%mm0,      %%mm3;           "
> >         " paddw         %%mm2,      %%mm3;           "
> >         " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
> >         " psraw           $15,      %%mm0;           "
> >         " pand          %%mm4,      %%mm0;           "
> >         " movq          %%mm4,      %%mm2;           "
> >         " psraw           $15,      %%mm2;           "
> >         " pand          %%mm1,      %%mm2;           "
> >         " pmulhw        %%mm1,      %%mm4;           "
> >         " paddw         %%mm0,      %%mm4;           "
> >         " paddw         %%mm2,      %%mm4;           "
> 1246c1314,1323
> <         " pmulhuw       %%mm7,      %%mm4;           "
> ---
> >         " movq          %%mm7,      %%mm0;           "
> >         " psraw           $15,      %%mm0;           "
> >         " pand          %%mm4,      %%mm0;           "
> >         " movq          %%mm4,      %%mm2;           "
> >         " psraw           $15,      %%mm2;           "
> >         " pand          %%mm7,      %%mm2;           "
> >         " pmulhw        %%mm7,      %%mm4;           "
> >         " paddw         %%mm0,      %%mm4;           "
> >         " paddw         %%mm2,      %%mm4;           "
> >         " pxor          %%mm0,      %%mm0;           "
> 1270c1347,1348
> <         " pshufw    $0, %%mm7,      %%mm7;           "
> ---
> >         " punpcklwd     %%mm7,      %%mm7;           "
> >         " punpckldq     %%mm7,      %%mm7;           "
> 1292c1370,1371
> <         " pshufw    $0, %%mm1,      %%mm1;           "
> ---
> >         " punpcklwd     %%mm1,      %%mm1;           "
> >         " punpckldq     %%mm1,      %%mm1;           "
> 1301,1302c1380,1397
> <         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
> <         " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
> ---
> >         " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16) */
> >         " psraw           $15,      %%mm0;           "
> >         " pand          %%mm3,      %%mm0;           "
> >         " movq          %%mm3,      %%mm2;           "
> >         " psraw           $15,      %%mm2;           "
> >         " pand          %%mm4,      %%mm2;           "
> >         " pmulhw        %%mm4,      %%mm3;           "
> >         " paddw         %%mm0,      %%mm3;           "
> >         " paddw         %%mm2,      %%mm3;           "
> >         " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
> >         " psraw           $15,      %%mm0;           "
> >         " pand          %%mm4,      %%mm0;           "
> >         " movq          %%mm4,      %%mm2;           "
> >         " psraw           $15,      %%mm2;           "
> >         " pand          %%mm1,      %%mm2;           "
> >         " pmulhw        %%mm1,      %%mm4;           "
> >         " paddw         %%mm0,      %%mm4;           "
> >         " paddw         %%mm2,      %%mm4;           "
> 1306c1401,1410
> <         " pmulhuw       %%mm7,      %%mm4;           "
> ---
> >         " movq          %%mm7,      %%mm0;           "
> >         " psraw           $15,      %%mm0;           "
> >         " pand          %%mm4,      %%mm0;           "
> >         " movq          %%mm4,      %%mm2;           "
> >         " psraw           $15,      %%mm2;           "
> >         " pand          %%mm7,      %%mm2;           "
> >         " pmulhw        %%mm7,      %%mm4;           "
> >         " paddw         %%mm0,      %%mm4;           "
> >         " paddw         %%mm2,      %%mm4;           "
> >         " pxor          %%mm0,      %%mm0;           "
> 1535,1536c1639,1642
> <              " pshufw      $0, %%mm1, %%mm1;                      "
> <              " pshufw      $0, %%mm2, %%mm2;                      "
> ---
> >              " punpcklwd  %%mm1,      %%mm1;                      "
> >              " punpckldq  %%mm1,      %%mm1;                      "
> >              " punpcklwd  %%mm2,      %%mm2;                      "
> >              " punpckldq  %%mm2,      %%mm2;                      "
> 1564,1565c1670,1673
> <              " pshufw      $0, %%mm1, %%mm1;                      "
> <              " pshufw      $0, %%mm2, %%mm2;                      "
> ---
> >              " punpcklwd  %%mm1,      %%mm1;                      "
> >              " punpckldq  %%mm1,      %%mm1;                      "
> >              " punpcklwd  %%mm2,      %%mm2;                      "
> >              " punpckldq  %%mm2,      %%mm2;                      "
>
>