[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]

[pygame] PATCH: enable MMX for X86_64, VideoCapture



Hello,

The attached patch and source files contain a workaround for building pygame with MMX scaling support on 64-bit Windows, using Visual Studio together with 64-bit mingw.

When scale_mmx.c is compiled with the 64-bit Visual Studio compiler and without -D_NO_MMX_FOR_X86_64, it includes scale_mmx64_msvc.c, in which all asm code is replaced by function calls. The functions that contain the asm are implemented in scale_mmx64_gcc.c, which can be compiled independently with 64-bit mingw:

x86_64-w64-mingw32-gcc.exe -c src/scale_mmx64_gcc.c -fno-leading-underscore -o obj/win64/scale_mmx64_gcc.obj

The resulting scale_mmx64_gcc.obj is then linked during the MSVC build process.

The patch should not interfere with other compilers (not verified). It is just a workaround until mingw-64-bit has support for using the msvc9 runtime. It is probably not a good idea to apply the patch to the main trunk since it duplicates code.

The resulting 1.9.2pre binaries pass all tests, except freetype_font_test.

The patch also tries to import vidcap from the VideoCapture package.

Christoph


Index: src/scale_mmx.c
===================================================================
--- src/scale_mmx.c	(revision 2654)
+++ src/scale_mmx.c	(working copy)
@@ -35,3 +35,7 @@
 #       include "scale_mmx32.c"
 #   endif
 #endif
+
+#if defined(_M_X64) && !defined(_NO_MMX_FOR_X86_64)
+#include "scale_mmx64_msvc.c"
+#endif
\ No newline at end of file
Index: src/scale.h
===================================================================
--- src/scale.h	(revision 2654)
+++ src/scale.h	(working copy)
@@ -56,6 +56,18 @@
 
 void filter_expand_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight);
 
+#if defined(_M_X64)
+
+void filter_shrink_Y_MMX_gcc(Uint8 *srcpix, Uint8 *dstpix, Uint16 *templine, int width, int srcpitch, int dstpitch, int srcheight, int dstheight);
+
+void filter_shrink_Y_SSE_gcc(Uint8 *srcpix, Uint8 *dstpix, Uint16 *templine, int width, int srcpitch, int dstpitch, int srcheight, int dstheight);
+
+void filter_expand_X_MMX_gcc(Uint8 *srcpix, Uint8 *dstpix, int *xidx0, int *xmult0, int *xmult1, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth);
+
+void filter_expand_X_SSE_gcc(Uint8 *srcpix, Uint8 *dstpix, int *xidx0, int *xmult0, int *xmult1, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth);
+
+#endif /* #if defined(_M_X64) */
+
 #endif /* #if (defined(__GNUC__) && .....) */
 
 #endif /* #if !defined(SCALE_HEADER) */
Index: lib/_camera_vidcapture.py
===================================================================
--- lib/_camera_vidcapture.py	(revision 2654)
+++ lib/_camera_vidcapture.py	(working copy)
@@ -18,7 +18,10 @@
 
 def init():
     global vidcap
-    import vidcap as vc
+    try:
+        import vidcap as vc
+    except ImportError:
+        from VideoCapture import vidcap as vc
     vidcap = vc
 
 def quit():
Index: setup.py
===================================================================
--- setup.py	(revision 2654)
+++ setup.py	(working copy)
@@ -360,15 +360,17 @@
     def replace_scale_mmx():
         for e in extensions:
             if e.name == 'transform':
-                e.extra_objects.append(
-                    os.path.join('obj', 'win32', 'scale_mmx.obj'))
-                for i in range(len(e.sources)):
-                    if e.sources[i].endswith('scale_mmx.c'):
-                        del e.sources[i]
-                        return
-    # linking to 64-bit mingw generated scale_mmx.obj fails
-    if not '64 bit' in sys.version:
-        replace_scale_mmx()
+                if '64 bit' in sys.version:
+                    e.extra_objects.append(
+                        os.path.join('obj', 'win64', 'scale_mmx64_gcc.obj'))
+                else:
+                    e.extra_objects.append(
+                        os.path.join('obj', 'win32', 'scale_mmx.obj'))
+                    for i in range(len(e.sources)):
+                        if e.sources[i].endswith('scale_mmx.c'):
+                            del e.sources[i]
+                            return
+    replace_scale_mmx()
 
 
 #clean up the list of extensions
/*
  pygame - Python Game Library
  Copyright (C) 2000-2001  Pete Shinners
  Copyright (C) 2007  Rene Dudfield, Richard Goedeken 

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Library General Public
  License as published by the Free Software Foundation; either
  version 2 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Library General Public License for more details.

  You should have received a copy of the GNU Library General Public
  License along with this library; if not, write to the Free
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Pete Shinners
  pete@xxxxxxxxxxxx
*/

/* Pentium 64 bit SSE/MMX smoothscale routines
 * These are written for compilation with GCC only.
 *
 * This file should not depend on anything.
 */

#if !defined(__GNUC__)
#error "Pygame build bug: should not be compiling this file!"
#endif

#include <stdint.h>
typedef uint8_t Uint8;    /* SDL convention */
typedef uint16_t Uint16;  /* SDL convention */
#include <stdlib.h>
#include <memory.h>
#include "scale.h"


/* These functions implement an area-averaging shrinking filter in the X-dimension.
 */
/* Shrink rows of 4-byte pixels horizontally with an area-averaging filter,
 * using plain MMX instructions.
 *
 *   srcpix, dstpix       first byte of the source / destination image
 *   height               number of rows to process
 *   srcpitch, dstpitch   bytes from the start of one row to the next
 *   srcwidth, dstwidth   pixels per source / destination row
 *
 * The horizontal position is kept in fixed point where 0x4000 stands for one
 * source pixel.  The function returns without writing anything when a
 * dimension is not positive or the step rounds down to zero: those inputs
 * would otherwise divide by zero or make the count-down loops wrap around.
 */
void
filter_shrink_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    /* bytes between the end of one row and the start of the next */
    long long srcdiff64 = srcpitch - (srcwidth * 4);
    long long dstdiff64 = dstpitch - (dstwidth * 4);
    long long One64 = 0x4000400040004000ULL; /* 0x4000 in every 16-bit lane */
    int xspace;
    int xrecip;

    if (height <= 0 || srcwidth <= 0 || dstwidth <= 0)
    {
        return;
    }
    xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
    if (xspace <= 0)
    {
        return;
    }
    xrecip = 0x40000000 / xspace;

    /* Each "movq / psraw / pand ... pmulhw / paddw / paddw" group below is a
     * high-word multiply built from the signed pmulhw plus two correction
     * terms; filter_shrink_X_SSE uses a single pmulhuw in the same places.
     *
     * height is the row counter decremented by the asm, so it is declared
     * read-write ("+m").  The asm stores through dstpix and changes the
     * flags and all MMX registers, hence the clobber list.
     */
    __asm__ __volatile__(" /* MMX code for X-shrink area average filter */ "
        " pxor          %%mm0,      %%mm0;           "
        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
        " punpcklwd     %%mm7,      %%mm7;           "
        " punpckldq     %%mm7,      %%mm7;           "
        "1:                                          " /* outer Y-loop */
        " movl             %5,      %%ecx;           " /* ecx == xcounter */
        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
        " movl             %4,      %%edx;           " /* edx == width */
        "2:                                          " /* inner X-loop */
        " cmpl        $0x4000,      %%ecx;           "
        " jbe              3f;                       "
        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
        " add              $4,         %0;           "
        " punpcklbw     %%mm0,      %%mm2;           "
        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
        " subl        $0x4000,      %%ecx;           "
        " jmp              4f;                       "
        "3:                                          " /* prepare to output a pixel */
        " movd          %%ecx,      %%mm2;           "
        " movq             %3,      %%mm3;           " /* mm3 = 2^14  */
        " punpcklwd     %%mm2,      %%mm2;           "
        " punpckldq     %%mm2,      %%mm2;           "
        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
        " add              $4,         %0;           "
        " punpcklbw     %%mm0,      %%mm4;           "
        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
        " psllw            $2,      %%mm4;           "
        " movq          %%mm4,      %%mm5;           " /* mm2 = (srcpix * xcounter >> 16) */
        " psraw           $15,      %%mm5;           "
        " pand          %%mm2,      %%mm5;           "
        " movq          %%mm2,      %%mm6;           "
        " psraw           $15,      %%mm6;           "
        " pand          %%mm4,      %%mm6;           "
        " pmulhw        %%mm4,      %%mm2;           "
        " paddw         %%mm5,      %%mm2;           "
        " paddw         %%mm6,      %%mm2;           "
        " movq          %%mm4,      %%mm5;           " /* mm3 = (srcpix * xfrac) >> 16) */
        " psraw           $15,      %%mm5;           "
        " pand          %%mm3,      %%mm5;           "
        " movq          %%mm3,      %%mm6;           "
        " psraw           $15,      %%mm6;           "
        " pand          %%mm4,      %%mm6;           "
        " pmulhw        %%mm4,      %%mm3;           "
        " paddw         %%mm5,      %%mm3;           "
        " paddw         %%mm6,      %%mm3;           "
        " paddw         %%mm1,      %%mm2;           "
        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
        " movq          %%mm7,      %%mm5;           "
        " psraw           $15,      %%mm5;           "
        " pand          %%mm2,      %%mm5;           "
        " movq          %%mm2,      %%mm6;           "
        " psraw           $15,      %%mm6;           "
        " pand          %%mm7,      %%mm6;           "
        " pmulhw        %%mm7,      %%mm2;           "
        " paddw         %%mm5,      %%mm2;           "
        " paddw         %%mm6,      %%mm2;           "
        " packuswb      %%mm0,      %%mm2;           "
        " movd          %%mm2,       (%1);           "
        " add              %5,      %%ecx;           "
        " add              $4,         %1;           "
        " subl        $0x4000,      %%ecx;           "
        "4:                                          " /* tail of inner X-loop */
        " decl          %%edx;                       "
        " jne              2b;                       "
        " add              %7,         %0;           " /* srcpix += srcdiff */
        " add              %8,         %1;           " /* dstpix += dstdiff */
        " decl             %2;                       " /* one row done */
        " jne              1b;                       "
        " emms;                                      "
        : "+r"(srcpix), "+r"(dstpix), "+m"(height)   /* read and written */
        : "m"(One64),   "m"(srcwidth), "m"(xspace),
          "m"(xrecip),  "m"(srcdiff64), "m"(dstdiff64)     /* inputs */
        : "%ecx", "%edx",
          "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7",
          "memory", "cc"              /* clobbered */
        );
}

/* Shrink rows of 4-byte pixels horizontally with an area-averaging filter.
 * Same algorithm and arguments as filter_shrink_X_MMX, but uses the pshufw
 * and pmulhuw instructions instead of the longer plain-MMX sequences.
 *
 *   srcpix, dstpix       first byte of the source / destination image
 *   height               number of rows to process
 *   srcpitch, dstpitch   bytes from the start of one row to the next
 *   srcwidth, dstwidth   pixels per source / destination row
 *
 * Returns without writing anything when a dimension is not positive or the
 * fixed-point step (0x4000 == one source pixel) rounds down to zero.
 */
void
filter_shrink_X_SSE(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    /* bytes between the end of one row and the start of the next */
    long long srcdiff64 = srcpitch - (srcwidth * 4);
    long long dstdiff64 = dstpitch - (dstwidth * 4);
    long long One64 = 0x4000400040004000ULL; /* 0x4000 in every 16-bit lane */
    int xspace;
    int xrecip;

    if (height <= 0 || srcwidth <= 0 || dstwidth <= 0)
    {
        return;
    }
    xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
    if (xspace <= 0)
    {
        return;
    }
    xrecip = 0x40000000 / xspace;

    /* height is the row counter decremented by the asm, so it is declared
     * read-write ("+m").  The asm stores through dstpix and changes the
     * flags and the MMX registers, hence the clobber list.
     */
    __asm__ __volatile__(" /* MMX code for X-shrink area average filter */ "
        " pxor          %%mm0,      %%mm0;           "
        " movd             %6,      %%mm7;           " /* mm7 == xrecipmmx */
        " movq             %3,      %%mm6;           " /* mm6 = 2^14  */
        " pshufw    $0, %%mm7,      %%mm7;           "
        "1:                                          " /* outer Y-loop */
        " movl             %5,      %%ecx;           " /* ecx == xcounter */
        " pxor          %%mm1,      %%mm1;           " /* mm1 == accumulator */
        " movl             %4,      %%edx;           " /* edx == width */
        "2:                                          " /* inner X-loop */
        " cmpl        $0x4000,      %%ecx;           "
        " jbe              3f;                       "
        " movd           (%0),      %%mm2;           " /* mm2 = srcpix */
        " add              $4,         %0;           "
        " punpcklbw     %%mm0,      %%mm2;           "
        " paddw         %%mm2,      %%mm1;           " /* accumulator += srcpix */
        " subl        $0x4000,      %%ecx;           "
        " jmp              4f;                       "
        "3:                                          " /* prepare to output a pixel */
        " movd          %%ecx,      %%mm2;           "
        " movq          %%mm6,      %%mm3;           " /* mm3 = 2^14  */
        " pshufw    $0, %%mm2,      %%mm2;           "
        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
        " add              $4,         %0;           "
        " punpcklbw     %%mm0,      %%mm4;           "
        " psubw         %%mm2,      %%mm3;           " /* mm3 = xfrac */
        " psllw            $2,      %%mm4;           "
        " pmulhuw       %%mm4,      %%mm2;           " /* mm2 = (srcpix * xcounter >> 16) */
        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * xfrac) >> 16 */
        " paddw         %%mm1,      %%mm2;           "
        " movq          %%mm3,      %%mm1;           " /* accumulator = (srcpix * xfrac) >> 16 */
        " pmulhuw       %%mm7,      %%mm2;           "
        " packuswb      %%mm0,      %%mm2;           "
        " movd          %%mm2,       (%1);           "
        " add              %5,      %%ecx;           "
        " add              $4,         %1;           "
        " subl        $0x4000,      %%ecx;           "
        "4:                                          " /* tail of inner X-loop */
        " decl          %%edx;                       "
        " jne              2b;                       "
        " add              %7,         %0;           " /* srcpix += srcdiff */
        " add              %8,         %1;           " /* dstpix += dstdiff */
        " decl             %2;                       " /* one row done */
        " jne              1b;                       "
        " emms;                                      "
        : "+r"(srcpix), "+r"(dstpix), "+m"(height)   /* read and written */
        : "m"(One64),   "m"(srcwidth), "m"(xspace),
          "m"(xrecip),  "m"(srcdiff64), "m"(dstdiff64)     /* inputs */
        : "%ecx", "%edx",
          "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7",
          "memory", "cc"              /* clobbered */
        );
}

/* These functions implement an area-averaging shrinking filter in the Y-dimension.
 */
/* Shrink an image of 4-byte pixels vertically with an area-averaging filter,
 * using plain MMX instructions.
 *
 *   srcpix, dstpix        first byte of the source / destination image
 *   templine              caller-provided accumulator line; the asm reads and
 *                         writes four 16-bit values (8 bytes) per pixel, i.e.
 *                         width * 4 Uint16 entries, and adds to whatever is
 *                         already stored there, so it must start out zeroed
 *   width                 pixels per row
 *   srcpitch, dstpitch    bytes from the start of one row to the next
 *   srcheight, dstheight  number of source / destination rows
 *
 * The vertical position is kept in fixed point where 0x4000 stands for one
 * source row.  The function returns without writing anything when templine
 * is NULL, a dimension is not positive, or the step rounds down to zero:
 * those inputs would otherwise fault, divide by zero or make the count-down
 * loops wrap around.
 */
void
filter_shrink_Y_MMX_gcc(Uint8 *srcpix, Uint8 *dstpix, Uint16 *templine, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    /* bytes between the end of one row and the start of the next */
    long long srcdiff64 = srcpitch - (width * 4);
    long long dstdiff64 = dstpitch - (width * 4);
    long long One64 = 0x4000400040004000ULL; /* 0x4000 in every 16-bit lane */
    int yspace;
    int yrecip;

    if (templine == NULL || width <= 0 || srcheight <= 0 || dstheight <= 0)
    {
        return;
    }
    yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
    if (yspace <= 0)
    {
        return;
    }
    yrecip = 0x40000000 / yspace;

    /* Each "movq / psraw / pand ... pmulhw / paddw / paddw" group below is a
     * high-word multiply built from the signed pmulhw plus two correction
     * terms; filter_shrink_Y_SSE_gcc uses a single pmulhuw in the same
     * places.  mm0 doubles as scratch inside loop 4 and is zeroed again
     * before packuswb.
     *
     * srcheight is the row counter decremented by the asm, so it is declared
     * read-write ("+m").  The asm reads and writes templine and stores
     * through dstpix, and changes the flags and all MMX registers, hence the
     * clobber list.
     */
    __asm__ __volatile__(" /* MMX code for Y-shrink area average filter */ "
        " movl             %5,      %%ecx;           " /* ecx == ycounter */
        " pxor          %%mm0,      %%mm0;           "
        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
        " punpcklwd     %%mm7,      %%mm7;           "
        " punpckldq     %%mm7,      %%mm7;           "
        "1:                                          " /* outer Y-loop */
        " mov              %3,      %%rax;           " /* rax == accumulate */
        " cmpl        $0x4000,      %%ecx;           "
        " jbe              3f;                       "
        " movl             %4,      %%edx;           " /* edx == width */
        "2:                                          "
        " movd           (%0),      %%mm1;           "
        " add              $4,         %0;           "
        " movq        (%%rax),      %%mm2;           "
        " punpcklbw     %%mm0,      %%mm1;           "
        " paddw         %%mm1,      %%mm2;           "
        " movq          %%mm2,    (%%rax);           "
        " add              $8,      %%rax;           "
        " decl          %%edx;                       "
        " jne              2b;                       "
        " subl        $0x4000,      %%ecx;           "
        " jmp              6f;                       "
        "3:                                          " /* prepare to output a line */
        " movd          %%ecx,      %%mm1;           "
        " movl             %4,      %%edx;           " /* edx = width */
        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
        " punpcklwd     %%mm1,      %%mm1;           "
        " punpckldq     %%mm1,      %%mm1;           "
        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
        "4:                                          "
        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
        " add              $4,         %0;           "
        " punpcklbw     %%mm0,      %%mm4;           "
        " movq        (%%rax),      %%mm5;           " /* mm5 = accumulate */
        " movq          %%mm6,      %%mm3;           "
        " psllw            $2,      %%mm4;           "
        " movq          %%mm4,      %%mm0;           " /* mm3 = (srcpix * yfrac) >> 16) */
        " psraw           $15,      %%mm0;           "
        " pand          %%mm3,      %%mm0;           "
        " movq          %%mm3,      %%mm2;           "
        " psraw           $15,      %%mm2;           "
        " pand          %%mm4,      %%mm2;           "
        " pmulhw        %%mm4,      %%mm3;           "
        " paddw         %%mm0,      %%mm3;           "
        " paddw         %%mm2,      %%mm3;           "
        " movq          %%mm1,      %%mm0;           " /* mm4 = (srcpix * ycounter >> 16) */
        " psraw           $15,      %%mm0;           "
        " pand          %%mm4,      %%mm0;           "
        " movq          %%mm4,      %%mm2;           "
        " psraw           $15,      %%mm2;           "
        " pand          %%mm1,      %%mm2;           "
        " pmulhw        %%mm1,      %%mm4;           "
        " paddw         %%mm0,      %%mm4;           "
        " paddw         %%mm2,      %%mm4;           "
        " movq          %%mm3,    (%%rax);           "
        " paddw         %%mm5,      %%mm4;           "
        " add              $8,      %%rax;           "
        " movq          %%mm7,      %%mm0;           "
        " psraw           $15,      %%mm0;           "
        " pand          %%mm4,      %%mm0;           "
        " movq          %%mm4,      %%mm2;           "
        " psraw           $15,      %%mm2;           "
        " pand          %%mm7,      %%mm2;           "
        " pmulhw        %%mm7,      %%mm4;           "
        " paddw         %%mm0,      %%mm4;           "
        " paddw         %%mm2,      %%mm4;           "
        " pxor          %%mm0,      %%mm0;           "
        " packuswb      %%mm0,      %%mm4;           "
        " movd          %%mm4,       (%1);           "
        " add              $4,         %1;           "
        " decl          %%edx;                       "
        " jne              4b;                       "
        " add              %8,         %1;           " /* dstpix += dstdiff */
        " addl             %5,      %%ecx;           "
        " subl        $0x4000,      %%ecx;           "
        "6:                                          " /* tail of outer Y-loop */
        " add              %7,         %0;           " /* srcpix += srcdiff */
        " decl             %2;                       " /* one source row done */
        " jne              1b;                       "
        " emms;                                      "
        : "+r"(srcpix), "+r"(dstpix), "+m"(srcheight)   /* read and written */
        : "m"(templine), "m"(width),     "m"(yspace),
          "m"(yrecip),  "m"(srcdiff64), "m"(dstdiff64), "m"(One64)  /* input */
        : "%ecx", "%edx", "%rax",
          "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7",
          "memory", "cc"                /* clobbered */
        );

}

/* Shrink an image of 4-byte pixels vertically with an area-averaging filter.
 * Same algorithm and arguments as filter_shrink_Y_MMX_gcc, but uses the
 * pshufw and pmulhuw instructions instead of the longer plain-MMX sequences.
 *
 *   srcpix, dstpix        first byte of the source / destination image
 *   templine              caller-provided accumulator line; the asm reads and
 *                         writes four 16-bit values (8 bytes) per pixel, i.e.
 *                         width * 4 Uint16 entries, and adds to whatever is
 *                         already stored there, so it must start out zeroed
 *   width                 pixels per row
 *   srcpitch, dstpitch    bytes from the start of one row to the next
 *   srcheight, dstheight  number of source / destination rows
 *
 * Returns without writing anything when templine is NULL, a dimension is not
 * positive, or the fixed-point step (0x4000 == one source row) rounds down
 * to zero.
 */
void
filter_shrink_Y_SSE_gcc(Uint8 *srcpix, Uint8 *dstpix, Uint16 *templine, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    /* bytes between the end of one row and the start of the next */
    long long srcdiff64 = srcpitch - (width * 4);
    long long dstdiff64 = dstpitch - (width * 4);
    long long One64 = 0x4000400040004000ULL; /* 0x4000 in every 16-bit lane */
    int yspace;
    int yrecip;

    if (templine == NULL || width <= 0 || srcheight <= 0 || dstheight <= 0)
    {
        return;
    }
    yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
    if (yspace <= 0)
    {
        return;
    }
    yrecip = 0x40000000 / yspace;

    /* srcheight is the row counter decremented by the asm, so it is declared
     * read-write ("+m").  The asm reads and writes templine and stores
     * through dstpix, and changes the flags and the MMX registers, hence the
     * clobber list.
     */
    __asm__ __volatile__(" /* MMX code for Y-shrink area average filter */ "
        " movl             %5,      %%ecx;           " /* ecx == ycounter */
        " pxor          %%mm0,      %%mm0;           "
        " movd             %6,      %%mm7;           " /* mm7 == yrecipmmx */
        " pshufw    $0, %%mm7,      %%mm7;           "
        "1:                                          " /* outer Y-loop */
        " mov              %3,      %%rax;           " /* rax == accumulate */
        " cmpl        $0x4000,      %%ecx;           "
        " jbe              3f;                       "
        " movl             %4,      %%edx;           " /* edx == width */
        "2:                                          "
        " movd           (%0),      %%mm1;           "
        " add              $4,         %0;           "
        " movq        (%%rax),      %%mm2;           "
        " punpcklbw     %%mm0,      %%mm1;           "
        " paddw         %%mm1,      %%mm2;           "
        " movq          %%mm2,    (%%rax);           "
        " add              $8,      %%rax;           "
        " decl          %%edx;                       "
        " jne              2b;                       "
        " subl        $0x4000,      %%ecx;           "
        " jmp              6f;                       "
        "3:                                          " /* prepare to output a line */
        " movd          %%ecx,      %%mm1;           "
        " movl             %4,      %%edx;           " /* edx = width */
        " movq             %9,      %%mm6;           " /* mm6 = 2^14  */
        " pshufw    $0, %%mm1,      %%mm1;           "
        " psubw         %%mm1,      %%mm6;           " /* mm6 = yfrac */
        "4:                                          "
        " movd           (%0),      %%mm4;           " /* mm4 = srcpix */
        " add              $4,         %0;           "
        " punpcklbw     %%mm0,      %%mm4;           "
        " movq        (%%rax),      %%mm5;           " /* mm5 = accumulate */
        " movq          %%mm6,      %%mm3;           "
        " psllw            $2,      %%mm4;           "
        " pmulhuw       %%mm4,      %%mm3;           " /* mm3 = (srcpix * yfrac) >> 16 */
        " pmulhuw       %%mm1,      %%mm4;           " /* mm4 = (srcpix * ycounter >> 16) */
        " movq          %%mm3,    (%%rax);           "
        " paddw         %%mm5,      %%mm4;           "
        " add              $8,      %%rax;           "
        " pmulhuw       %%mm7,      %%mm4;           "
        " packuswb      %%mm0,      %%mm4;           "
        " movd          %%mm4,       (%1);           "
        " add              $4,         %1;           "
        " decl          %%edx;                       "
        " jne              4b;                       "
        " add              %8,         %1;           " /* dstpix += dstdiff */
        " addl             %5,      %%ecx;           "
        " subl        $0x4000,      %%ecx;           "
        "6:                                          " /* tail of outer Y-loop */
        " add              %7,         %0;           " /* srcpix += srcdiff */
        " decl             %2;                       " /* one source row done */
        " jne              1b;                       "
        " emms;                                      "
        : "+r"(srcpix), "+r"(dstpix), "+m"(srcheight)   /* read and written */
        : "m"(templine), "m"(width),     "m"(yspace),
          "m"(yrecip),  "m"(srcdiff64), "m"(dstdiff64), "m"(One64)  /* input */
        : "%ecx", "%edx", "%rax",
          "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7",
          "memory", "cc"                /* clobbered */
        );
}

/* These functions implement a bilinear filter in the X-dimension.
 */
/* Stretch rows of 4-byte pixels horizontally with a bilinear filter.
 *
 *   srcpix, dstpix       first byte of the source / destination image
 *   xidx0                scratch table, dstwidth ints: index of the left
 *                        source pixel for each destination pixel
 *   xmult0, xmult1       scratch tables, 2 * dstwidth ints each: weight of
 *                        the left / right source pixel, out of 0x100,
 *                        repeated in all four 16-bit lanes
 *   height               number of rows to process
 *   srcpitch, dstpitch   bytes from the start of one row to the next
 *   srcwidth, dstwidth   pixels per source / destination row
 *
 * The three tables are filled here and then consumed by the asm loop, which
 * blends source pixels xidx0[x] and xidx0[x] + 1 for every destination pixel.
 * Returns at once when dstwidth is not positive, because the asm counts
 * dstwidth down to zero and would wrap around otherwise.
 */
void
filter_expand_X_MMX_gcc(Uint8 *srcpix, Uint8 *dstpix, int *xidx0, int *xmult0, int *xmult1, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    int x, y;

    if (dstwidth <= 0)
    {
        return;
    }

    /* Create multiplier factors and starting indices and put them in arrays */
    for (x = 0; x < dstwidth; x++)
    {
        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
        int xm0 = 0x100 - xm1;
        xidx0[x] = x * (srcwidth - 1) / dstwidth;
        xmult1[x*2]   = xm1 | (xm1 << 16);
        xmult1[x*2+1] = xm1 | (xm1 << 16);
        xmult0[x*2]   = xm0 | (xm0 << 16);
        xmult0[x*2+1] = xm0 | (xm0 << 16);
    }

    /* Do the scaling in raster order so we don't trash the cache */
    for (y = 0; y < height; y++)
    {
        Uint8 *srcrow0 = srcpix + y * srcpitch;
        Uint8 *dstrow = dstpix + y * dstpitch;
        int *xm0 = xmult0;
        int *xm1 = xmult1;
        int *x0 = xidx0;
        /* The "memory" clobber tells the compiler that the asm reads the
         * tables stored above and writes the destination row; "cc" and the
         * MMX registers are changed as well.
         */
        __asm__ __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
             " movl             %5,      %%ecx;           "
             " pxor          %%mm0,      %%mm0;           "
             "1:                                          "
             " movsxl         (%3),      %%rax;           " /* get xidx0[x] */
             " add              $4,         %3;           "
             " movq           (%0),      %%mm1;           " /* load mult0 */
             " add              $8,         %0;           "
             " movq           (%1),      %%mm2;           " /* load mult1 */
             " add              $8,         %1;           "
             " movd   (%4,%%rax,4),      %%mm4;           "
             " movd  4(%4,%%rax,4),      %%mm5;           "
             " punpcklbw     %%mm0,      %%mm4;           "
             " punpcklbw     %%mm0,      %%mm5;           "
             " pmullw        %%mm1,      %%mm4;           "
             " pmullw        %%mm2,      %%mm5;           "
             " paddw         %%mm4,      %%mm5;           "
             " psrlw            $8,      %%mm5;           "
             " packuswb      %%mm0,      %%mm5;           "
             " movd          %%mm5,       (%2);           "
             " add              $4,         %2;           "
             " decl          %%ecx;                       "
             " jne              1b;                       "
             " emms;                                      "
             : "+r"(xm0),   "+r"(xm1), "+r"(dstrow), "+r"(x0) /* outputs */
             : "r"(srcrow0),"m"(dstwidth)  /* input */
             : "%ecx", "%rax",
               "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7",
               "memory", "cc"               /* clobbered */
             );
    }

}

/* Stretch rows of 4-byte pixels horizontally with a bilinear filter.  The
 * instruction sequence is the same as in filter_expand_X_MMX_gcc.
 *
 *   srcpix, dstpix       first byte of the source / destination image
 *   xidx0                scratch table, dstwidth ints: index of the left
 *                        source pixel for each destination pixel
 *   xmult0, xmult1       scratch tables, 2 * dstwidth ints each: weight of
 *                        the left / right source pixel, out of 0x100,
 *                        repeated in all four 16-bit lanes
 *   height               number of rows to process
 *   srcpitch, dstpitch   bytes from the start of one row to the next
 *   srcwidth, dstwidth   pixels per source / destination row
 *
 * The three tables are filled here and then consumed by the asm loop, which
 * blends source pixels xidx0[x] and xidx0[x] + 1 for every destination pixel.
 * Returns at once when dstwidth is not positive, because the asm counts
 * dstwidth down to zero and would wrap around otherwise.
 */
void
filter_expand_X_SSE_gcc(Uint8 *srcpix, Uint8 *dstpix, int *xidx0, int *xmult0, int *xmult1, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    int x, y;

    if (dstwidth <= 0)
    {
        return;
    }

    /* Create multiplier factors and starting indices and put them in arrays */
    for (x = 0; x < dstwidth; x++)
    {
        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
        int xm0 = 0x100 - xm1;
        xidx0[x] = x * (srcwidth - 1) / dstwidth;
        xmult1[x*2]   = xm1 | (xm1 << 16);
        xmult1[x*2+1] = xm1 | (xm1 << 16);
        xmult0[x*2]   = xm0 | (xm0 << 16);
        xmult0[x*2+1] = xm0 | (xm0 << 16);
    }

    /* Do the scaling in raster order so we don't trash the cache */
    for (y = 0; y < height; y++)
    {
        Uint8 *srcrow0 = srcpix + y * srcpitch;
        Uint8 *dstrow = dstpix + y * dstpitch;
        int *xm0 = xmult0;
        int *xm1 = xmult1;
        int *x0 = xidx0;
        /* The "memory" clobber tells the compiler that the asm reads the
         * tables stored above and writes the destination row; "cc" and the
         * MMX registers are changed as well.
         */
        __asm__ __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
             " movl             %5,      %%ecx;           "
             " pxor          %%mm0,      %%mm0;           "
             "1:                                          "
             " movsxl         (%3),      %%rax;           " /* get xidx0[x] */
             " add              $4,         %3;           "
             " movq           (%0),      %%mm1;           " /* load mult0 */
             " add              $8,         %0;           "
             " movq           (%1),      %%mm2;           " /* load mult1 */
             " add              $8,         %1;           "
             " movd   (%4,%%rax,4),      %%mm4;           "
             " movd  4(%4,%%rax,4),      %%mm5;           "
             " punpcklbw     %%mm0,      %%mm4;           "
             " punpcklbw     %%mm0,      %%mm5;           "
             " pmullw        %%mm1,      %%mm4;           "
             " pmullw        %%mm2,      %%mm5;           "
             " paddw         %%mm4,      %%mm5;           "
             " psrlw            $8,      %%mm5;           "
             " packuswb      %%mm0,      %%mm5;           "
             " movd          %%mm5,       (%2);           "
             " add              $4,         %2;           "
             " decl          %%ecx;                       "
             " jne              1b;                       "
             " emms;                                      "
             : "+r"(xm0),   "+r"(xm1), "+r"(dstrow), "+r"(x0) /* outputs */
             : "r"(srcrow0),"m"(dstwidth)  /* input */
             : "%ecx", "%rax",
               "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7",
               "memory", "cc"               /* clobbered */
             );
    }

}

/* These functions implement a bilinear filter in the Y-dimension
 */
/* Stretch an image of 4-byte pixels vertically with a bilinear filter, using
 * plain MMX instructions.
 *
 *   srcpix, dstpix        first byte of the source / destination image
 *   width                 pixels per row
 *   srcpitch, dstpitch    bytes from the start of one row to the next
 *   srcheight, dstheight  number of source / destination rows
 *
 * Every destination row is a weighted blend of source row yidx0 and the row
 * right below it; the two weights add up to 0x100.  Returns at once when
 * width is not positive, because the asm counts width down to zero and would
 * wrap around otherwise.
 */
void
filter_expand_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    int y;

    if (width <= 0)
    {
        return;
    }

    for (y = 0; y < dstheight; y++)
    {
        int yidx0 = y * (srcheight - 1) / dstheight;
        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
        Uint8 *srcrow1 = srcrow0 + srcpitch;
        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
        int ymult0 = 0x0100 - ymult1;
        Uint8 *dstrow = dstpix + y * dstpitch;
        /* The asm stores the blended row through dstrow ("memory") and
         * changes the flags ("cc") and the MMX registers.
         */
        __asm__ __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
             " movl          %5,      %%ecx;                      "
             " movd          %3,      %%mm1;                      "
             " movd          %4,      %%mm2;                      "
             " pxor       %%mm0,      %%mm0;                      "
             " punpcklwd  %%mm1,      %%mm1;                      "
             " punpckldq  %%mm1,      %%mm1;                      "
             " punpcklwd  %%mm2,      %%mm2;                      "
             " punpckldq  %%mm2,      %%mm2;                      "
             "1:                                                  "
             " movd        (%0),      %%mm4;                      "
             " add           $4,         %0;                      "
             " movd        (%1),      %%mm5;                      "
             " add           $4,         %1;                      "
             " punpcklbw  %%mm0,      %%mm4;                      "
             " punpcklbw  %%mm0,      %%mm5;                      "
             " pmullw     %%mm1,      %%mm4;                      "
             " pmullw     %%mm2,      %%mm5;                      "
             " paddw      %%mm4,      %%mm5;                      "
             " psrlw         $8,      %%mm5;                      "
             " packuswb   %%mm0,      %%mm5;                      "
             " movd       %%mm5,       (%2);                      "
             " add           $4,         %2;                      "
             " decl       %%ecx;                                  "
             " jne           1b;                                  "
             " emms;                                              "
             : "+r"(srcrow0), "+r"(srcrow1), "+r"(dstrow)   /* outputs */
             : "m"(ymult0),   "m"(ymult1),   "m"(width)    /* input */
             : "%ecx",
               "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7",
               "memory", "cc"  /* clobbered */
             );
    }
}

/* Stretch an image of 4-byte pixels vertically with a bilinear filter.  Same
 * algorithm and arguments as filter_expand_Y_MMX, but broadcasts the two row
 * weights with pshufw.
 *
 *   srcpix, dstpix        first byte of the source / destination image
 *   width                 pixels per row
 *   srcpitch, dstpitch    bytes from the start of one row to the next
 *   srcheight, dstheight  number of source / destination rows
 *
 * Every destination row is a weighted blend of source row yidx0 and the row
 * right below it; the two weights add up to 0x100.  Returns at once when
 * width is not positive, because the asm counts width down to zero and would
 * wrap around otherwise.
 */
void
filter_expand_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    int y;

    if (width <= 0)
    {
        return;
    }

    for (y = 0; y < dstheight; y++)
    {
        int yidx0 = y * (srcheight - 1) / dstheight;
        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
        Uint8 *srcrow1 = srcrow0 + srcpitch;
        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
        int ymult0 = 0x0100 - ymult1;
        Uint8 *dstrow = dstpix + y * dstpitch;
        /* The asm stores the blended row through dstrow ("memory") and
         * changes the flags ("cc") and the MMX registers.
         */
        __asm__ __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
             " movl          %5,      %%ecx;                      "
             " movd          %3,      %%mm1;                      "
             " movd          %4,      %%mm2;                      "
             " pxor       %%mm0,      %%mm0;                      "
             " pshufw      $0, %%mm1, %%mm1;                      "
             " pshufw      $0, %%mm2, %%mm2;                      "
             "1:                                                  "
             " movd        (%0),      %%mm4;                      "
             " add           $4,         %0;                      "
             " movd        (%1),      %%mm5;                      "
             " add           $4,         %1;                      "
             " punpcklbw  %%mm0,      %%mm4;                      "
             " punpcklbw  %%mm0,      %%mm5;                      "
             " pmullw     %%mm1,      %%mm4;                      "
             " pmullw     %%mm2,      %%mm5;                      "
             " paddw      %%mm4,      %%mm5;                      "
             " psrlw         $8,      %%mm5;                      "
             " packuswb   %%mm0,      %%mm5;                      "
             " movd       %%mm5,       (%2);                      "
             " add           $4,         %2;                      "
             " decl       %%ecx;                                  "
             " jne           1b;                                  "
             " emms;                                              "
             : "+r"(srcrow0), "+r"(srcrow1), "+r"(dstrow)   /* outputs */
             : "m"(ymult0),   "m"(ymult1),   "m"(width)    /* input */
             : "%ecx",
               "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7",
               "memory", "cc"  /* clobbered */
             );
    }
}

/*
  pygame - Python Game Library
  Copyright (C) 2000-2001  Pete Shinners
  Copyright (C) 2007  Rene Dudfield, Richard Goedeken 

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Library General Public
  License as published by the Free Software Foundation; either
  version 2 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Library General Public License for more details.

  You should have received a copy of the GNU Library General Public
  License along with this library; if not, write to the Free
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

  Pete Shinners
  pete@xxxxxxxxxxxx
*/

/* Pentium 64 bit SSE/MMX smoothscale routines
 * These are written for compilation with MSVC only.
 *
 * This file should not depend on anything but the C standard library.
 */

#include <stdint.h>
/* Local definitions of the SDL-style fixed-width type names used by the
 * prototypes in scale.h, so that this file does not need any SDL header. */
typedef uint8_t Uint8;    /* SDL convention */
typedef uint16_t Uint16;  /* SDL convention */
#include <stdlib.h>
#include <memory.h>
#include "scale.h"

/* These functions implement an area-averaging shrinking filter in the Y-dimension.
 */
/* MMX entry point for the Y-dimension shrink filter.
 *
 * Obtains a zero-filled accumulator line of dstpitch * 2 bytes, forwards it
 * together with the caller's arguments to filter_shrink_Y_MMX_gcc, and
 * releases it again.  If the accumulator cannot be allocated the function
 * returns without calling the worker.
 */
void
filter_shrink_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    /* size of the accumulator line in bytes */
    const size_t accum_bytes = dstpitch * 2;
    Uint16 *accum = malloc(accum_bytes);

    if (accum != NULL)
    {
        /* the worker expects the accumulator to start out cleared */
        memset(accum, 0, accum_bytes);

        filter_shrink_Y_MMX_gcc(srcpix, dstpix, accum, width, srcpitch, dstpitch, srcheight, dstheight);

        free(accum);
    }
}

/* SSE entry point for the Y-dimension shrink filter.
 *
 * Obtains a zero-filled accumulator line of dstpitch * 2 bytes, forwards it
 * together with the caller's arguments to filter_shrink_Y_SSE_gcc, and
 * releases it again.  If the accumulator cannot be allocated the function
 * returns without calling the worker.
 */
void
filter_shrink_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    /* size of the accumulator line in bytes */
    const size_t accum_bytes = dstpitch * 2;
    Uint16 *accum = malloc(accum_bytes);

    if (accum != NULL)
    {
        /* the worker expects the accumulator to start out cleared */
        memset(accum, 0, accum_bytes);

        filter_shrink_Y_SSE_gcc(srcpix, dstpix, accum, width, srcpitch, dstpitch, srcheight, dstheight);

        free(accum);
    }
}

/* These functions implement a bilinear filter in the X-dimension.
 */
/* MMX entry point for the X-dimension bilinear expand filter.
 *
 * Allocates three scratch tables sized from dstwidth (4 bytes per column
 * for xidx0, factorwidth bytes per column for xmult0 and xmult1), passes
 * them with the caller's arguments to filter_expand_X_MMX_gcc, and frees
 * them afterwards.
 *
 * If any of the allocations fails, whatever was obtained is released and
 * the function returns without calling the worker.  There is no error
 * return, so in that case the destination is simply left unmodified.
 */
void
filter_expand_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    int *xidx0, *xmult0, *xmult1;
    int factorwidth = 8;

    /* Allocate memory for factors; do the size arithmetic in size_t so the
     * multiplication cannot overflow a signed int. */
    xidx0 = malloc((size_t) dstwidth * 4);
    xmult0 = malloc((size_t) dstwidth * factorwidth);
    xmult1 = malloc((size_t) dstwidth * factorwidth);
    if (xidx0 == NULL || xmult0 == NULL || xmult1 == NULL)
    {
        /* free(NULL) is a no-op, so no per-pointer guards are needed.
         * Returning here is essential: the buffers must be neither used
         * nor freed a second time below. */
        free(xidx0);
        free(xmult0);
        free(xmult1);
        return;
    }

    filter_expand_X_MMX_gcc(srcpix, dstpix, xidx0, xmult0, xmult1, height, srcpitch, dstpitch, srcwidth, dstwidth);

    /* free memory */
    free(xidx0);
    free(xmult0);
    free(xmult1);
}

/* SSE entry point for the X-dimension bilinear expand filter.
 *
 * Allocates three scratch tables sized from dstwidth (4 bytes per column
 * for xidx0, factorwidth bytes per column for xmult0 and xmult1), passes
 * them with the caller's arguments to filter_expand_X_SSE_gcc, and frees
 * them afterwards.
 *
 * If any of the allocations fails, whatever was obtained is released and
 * the function returns without calling the worker.  There is no error
 * return, so in that case the destination is simply left unmodified.
 */
void
filter_expand_X_SSE(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    int *xidx0, *xmult0, *xmult1;
    int factorwidth = 8;

    /* Allocate memory for factors; do the size arithmetic in size_t so the
     * multiplication cannot overflow a signed int. */
    xidx0 = malloc((size_t) dstwidth * 4);
    xmult0 = malloc((size_t) dstwidth * factorwidth);
    xmult1 = malloc((size_t) dstwidth * factorwidth);
    if (xidx0 == NULL || xmult0 == NULL || xmult1 == NULL)
    {
        /* free(NULL) is a no-op, so no per-pointer guards are needed.
         * Returning here is essential: the buffers must be neither used
         * nor freed a second time below. */
        free(xidx0);
        free(xmult0);
        free(xmult1);
        return;
    }

    filter_expand_X_SSE_gcc(srcpix, dstpix, xidx0, xmult0, xmult1, height, srcpitch, dstpitch, srcwidth, dstwidth);

    /* free memory */
    free(xidx0);
    free(xmult0);
    free(xmult1);
}