[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]

Re: [pygame] transform module not MMX



Yes, filter_shrink_Y_MMX was giving me concern. But I think this new replacement patch catches all the SSE instructions. I tested it with test/transform_test.py and examples/scaletest.py. Shrinking works. I have built but not tested it with SSE instructions enabled, the default build. The disassembled code looks correct. My only concern is whether my use of the stack pointer esp in the PMULHUW macro is compatible across operating systems.

René Dudfield wrote:
ah,

cool.  So the filter_shrink_Y_MMX function still needs to be fixed?

On 8/15/07, Lenard Lindstrom <len-l@xxxxxxxxx> wrote:
Lenard Lindstrom wrote:
Lenard Lindstrom wrote:
The assembler code in the transform module is not MMX as advertised.
It contains at least one SSE instruction: PSHUFW .

This is a patch for SVN transform.c that removes the offensive SSE
instructions when the macro NO_SSE is defined.

Never mind. I overlooked the PMULHUW SSE instructions in
filter_shrink_Y_MMX .

--
Lenard Lindstrom
<len-l@xxxxxxxxx>




--
Lenard Lindstrom
<len-l@xxxxxxxxx>

31a32,64
> /*  mmx/sse assembly sequence to duplicate the first word in an mmx register. */
> #if defined(NO_SSE)
> 
> #define PSHUFW_0(mmxregno)   \
>   " punpcklwd        %%mm" #mmxregno ", %%mm" #mmxregno ";"   \
>   " punpckldq        %%mm" #mmxregno ", %%mm" #mmxregno ";"
> 
> #define PMULHUW(mmxregno_s, mmxregno_d)  \
>   " movq       %%mm" #mmxregno_d ",            -16(%%esp);"  \
>   " psraw                      $15,   %%mm" #mmxregno_d ";"  \
>   " pand       %%mm" #mmxregno_s ",   %%mm" #mmxregno_d ";"  \
>   " movq       %%mm" #mmxregno_d ",             -8(%%esp);"  \
>   " movq                -16(%%esp),   %%mm" #mmxregno_d ";"  \
>   " pmulhw     %%mm" #mmxregno_s ",   %%mm" #mmxregno_d ";"  \
>   " paddw                -8(%%esp),   %%mm" #mmxregno_d ";"  \
>   " movq       %%mm" #mmxregno_d ",             -8(%%esp);"  \
>   " pxor       %%mm" #mmxregno_d ",   %%mm" #mmxregno_d ";"  \
>   " por        %%mm" #mmxregno_s ",   %%mm" #mmxregno_d ";"  \
>   " psraw                      $15,   %%mm" #mmxregno_d ";"  \
>   " pand                -16(%%esp),   %%mm" #mmxregno_d ";"  \
>   " paddw                -8(%%esp),   %%mm" #mmxregno_d ";"
> 
> #else
> 
> #define PSHUFW_0(mmxregno)   \
>   " pshufw       $0, %%mm" #mmxregno ", %%mm" #mmxregno ";"
> 
> #define PMULHUW(mmxregno_s, mmxregno_d)  \
>   " pmulhuw    %%mm" #mmxregno_s ",   %%mm" #mmxregno_d ";"
> 
> #endif  /* defined(NO_SSE) */
> 
> 
1026c1059
<         " pshufw    $0, %%mm7,      %%mm7;           "
---
> 	PSHUFW_0(7)
1043c1076
<         " pshufw    $0, %%mm2,      %%mm2;           "
---
> 	PSHUFW_0(2)
1049,1050c1082,1083
<         " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
<         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
---
>         PMULHUW(4, 2)                                  /* mm2 = (srcpix * xcounter >> 16) */
>         PMULHUW(4, 3)                                  /* mm3 = (srcpix * xfrac) >> 16 */
1053c1086
<         " pmulhuw       %%mm7,      %%mm2;           "
---
>         PMULHUW(7, 2)
1077c1110
<         " pshufw    $0, %%mm7,      %%mm7;           "
---
>         PSHUFW_0(7)
1094c1127
<         " pshufw    $0, %%mm2,      %%mm2;           "
---
>         PSHUFW_0(2)
1100,1101c1133,1134
<         " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
<         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
---
>         PMULHUW(4, 2)                                  /* mm2 = (srcpix * xcounter >> 16) */
>         PMULHUW(4, 3)                                  /* mm3 = (srcpix * xfrac) >> 16 */
1104c1137
<         " pmulhuw       %%mm7,      %%mm2;           "
---
>         PMULHUW(7, 2)
1210c1243
<         " pshufw    $0, %%mm7,      %%mm7;           "
---
>         PSHUFW_0(7)
1232c1265
<         " pshufw    $0, %%mm1,      %%mm1;           "
---
>         PSHUFW_0(1)
1241,1242c1274,1275
<         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
<         " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
---
> 	PMULHUW(4, 3)                                  /* mm3 = (srcpix * yfrac) >> 16 */
>         PMULHUW(1, 4)                                  /* mm4 = (srcpix * ycounter >> 16) */
1246c1279
<         " pmulhuw       %%mm7,      %%mm4;           "
---
> 	PMULHUW(7, 4)
1270c1303
<         " pshufw    $0, %%mm7,      %%mm7;           "
---
>         PSHUFW_0(7)
1292c1325
<         " pshufw    $0, %%mm1,      %%mm1;           "
---
>         PSHUFW_0(1)
1301,1302c1334,1335
<         " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
<         " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
---
> 	PMULHUW(4, 3)                                  /* mm3 = (srcpix * yfrac) >> 16 */
>         PMULHUW(1, 4)                                  /* mm4 = (srcpix * ycounter >> 16) */
1306c1339
<         " pmulhuw       %%mm7,      %%mm4;           "
---
> 	PMULHUW(7, 4)
1535,1536c1568,1569
<              " pshufw      $0, %%mm1, %%mm1;                      "
<              " pshufw      $0, %%mm2, %%mm2;                      "
---
>              PSHUFW_0(1)
>              PSHUFW_0(2)
1564,1565c1597,1598
<              " pshufw      $0, %%mm1, %%mm1;                      "
<              " pshufw      $0, %%mm2, %%mm2;                      "
---
>              PSHUFW_0(1)
>              PSHUFW_0(2)