[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[pygame] PATCH: enable MMX for X86_64, VideoCapture
Hello,
the attached patch and source files contain a workaround for building
pygame with MMX scaling support for 64-bit Windows using Visual Studio
and mingw-64-bit.
When compiling scale_mmx.c without -D_NO_MMX_FOR_X86_64 using the 64-bit
Visual Studio compiler the file scale_mmx64_msvc.c is included, which
has all asm code replaced by function calls. The functions that contain
the asm code are implemented in scale_mmx64_gcc.c, which can be compiled
independently with mingw-64-bit:
x86_64-w64-mingw32-gcc.exe -c src/scale_mmx64_gcc.c
-fno-leading-underscore -o obj/win64/scale_mmx64_gcc.obj
The resulting scale_mmx64_gcc.obj is then linked during the MSVC build
process.
The patch should not interfere with other compilers (not verified). It
is just a workaround until mingw-64-bit has support for using the msvc9
runtime. It is probably not a good idea to apply the patch to the main
trunk since it duplicates code.
The resulting 1.9.2pre binaries pass all tests, except freetype_font_test.
The patch also tries to import vidcap from the VideoCapture package.
Christoph
Index: src/scale_mmx.c
===================================================================
--- src/scale_mmx.c (revision 2654)
+++ src/scale_mmx.c (working copy)
@@ -35,3 +35,7 @@
# include "scale_mmx32.c"
# endif
#endif
+
+#if defined(_M_X64) && !defined(_NO_MMX_FOR_X86_64)
+#include "scale_mmx64_msvc.c"
+#endif
\ No newline at end of file
Index: src/scale.h
===================================================================
--- src/scale.h (revision 2654)
+++ src/scale.h (working copy)
@@ -56,6 +56,18 @@
void filter_expand_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight);
+#if defined(_M_X64)
+
+void filter_shrink_Y_MMX_gcc(Uint8 *srcpix, Uint8 *dstpix, Uint16 *templine, int width, int srcpitch, int dstpitch, int srcheight, int dstheight);
+
+void filter_shrink_Y_SSE_gcc(Uint8 *srcpix, Uint8 *dstpix, Uint16 *templine, int width, int srcpitch, int dstpitch, int srcheight, int dstheight);
+
+void filter_expand_X_MMX_gcc(Uint8 *srcpix, Uint8 *dstpix, int *xidx0, int *xmult0, int *xmult1, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth);
+
+void filter_expand_X_SSE_gcc(Uint8 *srcpix, Uint8 *dstpix, int *xidx0, int *xmult0, int *xmult1, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth);
+
+#endif /* #if defined(_M_X64) */
+
#endif /* #if (defined(__GNUC__) && .....) */
#endif /* #if !defined(SCALE_HEADER) */
Index: lib/_camera_vidcapture.py
===================================================================
--- lib/_camera_vidcapture.py (revision 2654)
+++ lib/_camera_vidcapture.py (working copy)
@@ -18,7 +18,10 @@
def init():
global vidcap
- import vidcap as vc
+ try:
+ import vidcap as vc
+ except ImportError:
+ from VideoCapture import vidcap as vc
vidcap = vc
def quit():
Index: setup.py
===================================================================
--- setup.py (revision 2654)
+++ setup.py (working copy)
@@ -360,15 +360,17 @@
def replace_scale_mmx():
for e in extensions:
if e.name == 'transform':
- e.extra_objects.append(
- os.path.join('obj', 'win32', 'scale_mmx.obj'))
- for i in range(len(e.sources)):
- if e.sources[i].endswith('scale_mmx.c'):
- del e.sources[i]
- return
- # linking to 64-bit mingw generated scale_mmx.obj fails
- if not '64 bit' in sys.version:
- replace_scale_mmx()
+ if '64 bit' in sys.version:
+ e.extra_objects.append(
+ os.path.join('obj', 'win64', 'scale_mmx64_gcc.obj'))
+ else:
+ e.extra_objects.append(
+ os.path.join('obj', 'win32', 'scale_mmx.obj'))
+ for i in range(len(e.sources)):
+ if e.sources[i].endswith('scale_mmx.c'):
+ del e.sources[i]
+ return
+ replace_scale_mmx()
#clean up the list of extensions
/*
pygame - Python Game Library
Copyright (C) 2000-2001 Pete Shinners
Copyright (C) 2007 Rene Dudfield, Richard Goedeken
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Pete Shinners
pete@xxxxxxxxxxxx
*/
/* Pentium 64 bit SSE/MMX smoothscale routines
* These are written for compilation with GCC only.
*
* This file should not depend on anything but the C standard headers
* included below and scale.h.
*/
#if !defined(__GNUC__)
#error "Pygame build bug: should not be compiling this file!"
#endif
#include <stdint.h>
typedef uint8_t Uint8; /* SDL convention */
typedef uint16_t Uint16; /* SDL convention */
#include <stdlib.h>
#include <memory.h>
#include "scale.h"
/* These functions implement an area-averaging shrinking filter in the X-dimension.
*/
/*
 * Shrink an image horizontally with an area-averaging filter (plain MMX).
 *
 * srcpix    first source pixel; pixels are 4 bytes each
 * dstpix    first destination pixel; pixels are 4 bytes each
 * height    number of rows to process
 * srcpitch  bytes from one source row to the next
 * dstpitch  bytes from one destination row to the next
 * srcwidth  source pixels per row
 * dstwidth  destination pixels per row
 *
 * Weights are 14-bit fixed point (0x4000 == 1.0).  The caller must make
 * sure dstwidth is non-zero and smaller than srcwidth (xspace > 0x4000).
 *
 * NOTE(review): the asm decrements operand %3 (height), which is declared
 * as an input-only "m" operand.  height is not read again after the asm,
 * but strictly it should be a read-write operand.
 */
void
filter_shrink_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    int srcdiff = srcpitch - (srcwidth * 4);    /* padding bytes after each source row */
    int dstdiff = dstpitch - (dstwidth * 4);    /* padding bytes after each destination row */

    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
    int xrecip = 0x40000000 / xspace;
    long long One64 = 0x4000400040004000ULL;    /* 0x4000 in each of the four 16-bit lanes */
    /* 64-bit copies so that "add %7, %0" / "add %8, %1" add to 64-bit pointers */
    long long srcdiff64 = srcdiff;
    long long dstdiff64 = dstdiff;

    /*
     * The three movq/psraw/pand/.../pmulhw/paddw/paddw groups below emulate
     * an unsigned 16x16 high multiply: pmulhw is signed, so each operand is
     * added back where the other one has its top bit set.
     */
    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
        " pxor %%mm0, %%mm0; "          /* mm0 = 0, used to widen bytes to words */
        " movd %6, %%mm7; "             /* mm7 == xrecipmmx */
        " punpcklwd %%mm7, %%mm7; "     /* broadcast the low word to all lanes */
        " punpckldq %%mm7, %%mm7; "
        "1: "                           /* outer Y-loop */
        " movl %5, %%ecx; "             /* ecx == xcounter */
        " pxor %%mm1, %%mm1; "          /* mm1 == accumulator */
        " movl %4, %%edx; "             /* edx == srcwidth */
        "2: "                           /* inner X-loop */
        " cmpl $0x4000, %%ecx; "
        " jbe 3f; "
        " movd (%0), %%mm2; "           /* mm2 = srcpix */
        " add $4, %0; "
        " punpcklbw %%mm0, %%mm2; "
        " paddw %%mm2, %%mm1; "         /* accumulator += srcpix */
        " subl $0x4000, %%ecx; "
        " jmp 4f; "
        "3: "                           /* prepare to output a pixel */
        " movd %%ecx, %%mm2; "
        " movq %2, %%mm3; "             /* mm3 = 2^14 */
        " punpcklwd %%mm2, %%mm2; "
        " punpckldq %%mm2, %%mm2; "
        " movd (%0), %%mm4; "           /* mm4 = srcpix */
        " add $4, %0; "
        " punpcklbw %%mm0, %%mm4; "
        " psubw %%mm2, %%mm3; "         /* mm3 = xfrac */
        " psllw $2, %%mm4; "            /* <<2 so the >>16 below nets >>14 */
        " movq %%mm4, %%mm5; "          /* mm2 = (srcpix * xcounter) >> 16 */
        " psraw $15, %%mm5; "
        " pand %%mm2, %%mm5; "
        " movq %%mm2, %%mm6; "
        " psraw $15, %%mm6; "
        " pand %%mm4, %%mm6; "
        " pmulhw %%mm4, %%mm2; "
        " paddw %%mm5, %%mm2; "
        " paddw %%mm6, %%mm2; "
        " movq %%mm4, %%mm5; "          /* mm3 = (srcpix * xfrac) >> 16 */
        " psraw $15, %%mm5; "
        " pand %%mm3, %%mm5; "
        " movq %%mm3, %%mm6; "
        " psraw $15, %%mm6; "
        " pand %%mm4, %%mm6; "
        " pmulhw %%mm4, %%mm3; "
        " paddw %%mm5, %%mm3; "
        " paddw %%mm6, %%mm3; "
        " paddw %%mm1, %%mm2; "         /* add what was accumulated so far */
        " movq %%mm3, %%mm1; "          /* accumulator = (srcpix * xfrac) >> 16 */
        " movq %%mm7, %%mm5; "          /* mm2 = (mm2 * xrecip) >> 16 */
        " psraw $15, %%mm5; "
        " pand %%mm2, %%mm5; "
        " movq %%mm2, %%mm6; "
        " psraw $15, %%mm6; "
        " pand %%mm7, %%mm6; "
        " pmulhw %%mm7, %%mm2; "
        " paddw %%mm5, %%mm2; "
        " paddw %%mm6, %%mm2; "
        " packuswb %%mm0, %%mm2; "      /* words back to bytes, saturated */
        " movd %%mm2, (%1); "
        " add %5, %%ecx; "              /* xcounter += xspace - 0x4000 */
        " add $4, %1; "
        " subl $0x4000, %%ecx; "
        "4: "                           /* tail of inner X-loop */
        " decl %%edx; "
        " jne 2b; "
        " add %7, %0; "                 /* srcpix += srcdiff */
        " add %8, %1; "                 /* dstpix += dstdiff */
        " decl %3; "
        " jne 1b; "
        " emms; "
        : "+r"(srcpix), "+r"(dstpix)    /* outputs */
        : "m"(One64), "m"(height), "m"(srcwidth),
          "m"(xspace), "m"(xrecip), "m"(srcdiff64), "m"(dstdiff64) /* inputs */
        /* "memory": the asm reads *srcpix and writes *dstpix, which are not
           listed as operands; "cc": the flags are modified */
        : "%ecx", "%edx", "memory", "cc" /* clobbered */
        );
}
/*
 * Shrink an image horizontally with an area-averaging filter.
 *
 * Same algorithm and contract as filter_shrink_X_MMX, but it uses pshufw to
 * broadcast a word and pmulhuw for the unsigned high multiply instead of
 * the longer plain-MMX sequences.
 *
 * srcpix    first source pixel; pixels are 4 bytes each
 * dstpix    first destination pixel; pixels are 4 bytes each
 * height    number of rows to process
 * srcpitch  bytes from one source row to the next
 * dstpitch  bytes from one destination row to the next
 * srcwidth  source pixels per row
 * dstwidth  destination pixels per row (must be non-zero)
 *
 * NOTE(review): the asm decrements operand %3 (height), which is declared
 * as an input-only "m" operand.  height is not read again after the asm,
 * but strictly it should be a read-write operand.
 */
void
filter_shrink_X_SSE(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    int srcdiff = srcpitch - (srcwidth * 4);    /* padding bytes after each source row */
    int dstdiff = dstpitch - (dstwidth * 4);    /* padding bytes after each destination row */

    int xspace = 0x04000 * srcwidth / dstwidth; /* must be > 1 */
    int xrecip = 0x40000000 / xspace;
    long long One64 = 0x4000400040004000ULL;    /* 0x4000 in each of the four 16-bit lanes */
    /* 64-bit copies so that "add %7, %0" / "add %8, %1" add to 64-bit pointers */
    long long srcdiff64 = srcdiff;
    long long dstdiff64 = dstdiff;

    asm __volatile__(" /* MMX code for X-shrink area average filter */ "
        " pxor %%mm0, %%mm0; "          /* mm0 = 0, used to widen bytes to words */
        " movd %6, %%mm7; "             /* mm7 == xrecipmmx */
        " movq %2, %%mm6; "             /* mm6 = 2^14 */
        " pshufw $0, %%mm7, %%mm7; "    /* broadcast the low word to all lanes */
        "1: "                           /* outer Y-loop */
        " movl %5, %%ecx; "             /* ecx == xcounter */
        " pxor %%mm1, %%mm1; "          /* mm1 == accumulator */
        " movl %4, %%edx; "             /* edx == srcwidth */
        "2: "                           /* inner X-loop */
        " cmpl $0x4000, %%ecx; "
        " jbe 3f; "
        " movd (%0), %%mm2; "           /* mm2 = srcpix */
        " add $4, %0; "
        " punpcklbw %%mm0, %%mm2; "
        " paddw %%mm2, %%mm1; "         /* accumulator += srcpix */
        " subl $0x4000, %%ecx; "
        " jmp 4f; "
        "3: "                           /* prepare to output a pixel */
        " movd %%ecx, %%mm2; "
        " movq %%mm6, %%mm3; "          /* mm3 = 2^14 */
        " pshufw $0, %%mm2, %%mm2; "
        " movd (%0), %%mm4; "           /* mm4 = srcpix */
        " add $4, %0; "
        " punpcklbw %%mm0, %%mm4; "
        " psubw %%mm2, %%mm3; "         /* mm3 = xfrac */
        " psllw $2, %%mm4; "            /* <<2 so the >>16 below nets >>14 */
        " pmulhuw %%mm4, %%mm2; "       /* mm2 = (srcpix * xcounter) >> 16 */
        " pmulhuw %%mm4, %%mm3; "       /* mm3 = (srcpix * xfrac) >> 16 */
        " paddw %%mm1, %%mm2; "         /* add what was accumulated so far */
        " movq %%mm3, %%mm1; "          /* accumulator = (srcpix * xfrac) >> 16 */
        " pmulhuw %%mm7, %%mm2; "       /* scale by xrecip */
        " packuswb %%mm0, %%mm2; "      /* words back to bytes, saturated */
        " movd %%mm2, (%1); "
        " add %5, %%ecx; "              /* xcounter += xspace - 0x4000 */
        " add $4, %1; "
        " subl $0x4000, %%ecx; "
        "4: "                           /* tail of inner X-loop */
        " decl %%edx; "
        " jne 2b; "
        " add %7, %0; "                 /* srcpix += srcdiff */
        " add %8, %1; "                 /* dstpix += dstdiff */
        " decl %3; "
        " jne 1b; "
        " emms; "
        : "+r"(srcpix), "+r"(dstpix)    /* outputs */
        : "m"(One64), "m"(height), "m"(srcwidth),
          "m"(xspace), "m"(xrecip), "m"(srcdiff64), "m"(dstdiff64) /* inputs */
        /* "memory": the asm reads *srcpix and writes *dstpix, which are not
           listed as operands; "cc": the flags are modified */
        : "%ecx", "%edx", "memory", "cc" /* clobbered */
        );
}
/* These functions implement an area-averaging shrinking filter in the Y-dimension.
*/
/*
 * Shrink an image vertically with an area-averaging filter (plain MMX).
 *
 * srcpix     first source pixel; pixels are 4 bytes each
 * dstpix     first destination pixel; pixels are 4 bytes each
 * templine   caller-owned accumulator line; the asm reads and writes
 *            8 bytes (four 16-bit sums) per pixel, i.e. width * 8 bytes.
 *            It is added to before it is first stored, so it presumably
 *            has to be zeroed by the caller (the MSVC wrappers do that).
 * width      pixels per row
 * srcpitch   bytes from one source row to the next
 * dstpitch   bytes from one destination row to the next
 * srcheight  number of source rows
 * dstheight  number of destination rows (must be non-zero)
 *
 * Weights are 14-bit fixed point (0x4000 == 1.0).
 *
 * NOTE(review): the asm decrements operand %3 (srcheight), which is
 * declared as an input-only "m" operand.  srcheight is not read again after
 * the asm, but strictly it should be a read-write operand.
 */
void
filter_shrink_Y_MMX_gcc(Uint8 *srcpix, Uint8 *dstpix, Uint16 *templine, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    int srcdiff = srcpitch - (width * 4);       /* padding bytes after each source row */
    int dstdiff = dstpitch - (width * 4);       /* padding bytes after each destination row */
    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
    int yrecip = 0x40000000 / yspace;
    long long One64 = 0x4000400040004000ULL;    /* 0x4000 in each of the four 16-bit lanes */
    /* 64-bit copies so that "add %7, %0" / "add %8, %1" add to 64-bit pointers */
    long long srcdiff64 = srcdiff;
    long long dstdiff64 = dstdiff;

    /*
     * The movq/psraw/pand/.../pmulhw/paddw/paddw groups below emulate an
     * unsigned 16x16 high multiply: pmulhw is signed, so each operand is
     * added back where the other one has its top bit set.  mm0 doubles as a
     * scratch register inside loop 4 and is re-zeroed before packuswb.
     */
    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
        " movl %5, %%ecx; "             /* ecx == ycounter */
        " pxor %%mm0, %%mm0; "
        " movd %6, %%mm7; "             /* mm7 == yrecipmmx */
        " punpcklwd %%mm7, %%mm7; "     /* broadcast the low word to all lanes */
        " punpckldq %%mm7, %%mm7; "
        "1: "                           /* outer Y-loop */
        " mov %2, %%rax; "              /* rax == accumulate */
        " cmpl $0x4000, %%ecx; "
        " jbe 3f; "
        " movl %4, %%edx; "             /* edx == width */
        "2: "                           /* add a whole source row to the accumulator line */
        " movd (%0), %%mm1; "
        " add $4, %0; "
        " movq (%%rax), %%mm2; "
        " punpcklbw %%mm0, %%mm1; "
        " paddw %%mm1, %%mm2; "
        " movq %%mm2, (%%rax); "
        " add $8, %%rax; "
        " decl %%edx; "
        " jne 2b; "
        " subl $0x4000, %%ecx; "
        " jmp 6f; "
        "3: "                           /* prepare to output a line */
        " movd %%ecx, %%mm1; "
        " movl %4, %%edx; "             /* edx = width */
        " movq %9, %%mm6; "             /* mm6 = 2^14 */
        " punpcklwd %%mm1, %%mm1; "
        " punpckldq %%mm1, %%mm1; "
        " psubw %%mm1, %%mm6; "         /* mm6 = yfrac */
        "4: "
        " movd (%0), %%mm4; "           /* mm4 = srcpix */
        " add $4, %0; "
        " punpcklbw %%mm0, %%mm4; "
        " movq (%%rax), %%mm5; "        /* mm5 = accumulate */
        " movq %%mm6, %%mm3; "
        " psllw $2, %%mm4; "            /* <<2 so the >>16 below nets >>14 */
        " movq %%mm4, %%mm0; "          /* mm3 = (srcpix * yfrac) >> 16 */
        " psraw $15, %%mm0; "
        " pand %%mm3, %%mm0; "
        " movq %%mm3, %%mm2; "
        " psraw $15, %%mm2; "
        " pand %%mm4, %%mm2; "
        " pmulhw %%mm4, %%mm3; "
        " paddw %%mm0, %%mm3; "
        " paddw %%mm2, %%mm3; "
        " movq %%mm1, %%mm0; "          /* mm4 = (srcpix * ycounter) >> 16 */
        " psraw $15, %%mm0; "
        " pand %%mm4, %%mm0; "
        " movq %%mm4, %%mm2; "
        " psraw $15, %%mm2; "
        " pand %%mm1, %%mm2; "
        " pmulhw %%mm1, %%mm4; "
        " paddw %%mm0, %%mm4; "
        " paddw %%mm2, %%mm4; "
        " movq %%mm3, (%%rax); "        /* remainder becomes the new accumulator */
        " paddw %%mm5, %%mm4; "
        " add $8, %%rax; "
        " movq %%mm7, %%mm0; "          /* mm4 = (mm4 * yrecip) >> 16 */
        " psraw $15, %%mm0; "
        " pand %%mm4, %%mm0; "
        " movq %%mm4, %%mm2; "
        " psraw $15, %%mm2; "
        " pand %%mm7, %%mm2; "
        " pmulhw %%mm7, %%mm4; "
        " paddw %%mm0, %%mm4; "
        " paddw %%mm2, %%mm4; "
        " pxor %%mm0, %%mm0; "          /* restore mm0 = 0 */
        " packuswb %%mm0, %%mm4; "      /* words back to bytes, saturated */
        " movd %%mm4, (%1); "
        " add $4, %1; "
        " decl %%edx; "
        " jne 4b; "
        " add %8, %1; "                 /* dstpix += dstdiff */
        " addl %5, %%ecx; "             /* ycounter += yspace - 0x4000 */
        " subl $0x4000, %%ecx; "
        "6: "                           /* tail of outer Y-loop */
        " add %7, %0; "                 /* srcpix += srcdiff */
        " decl %3; "
        " jne 1b; "
        " emms; "
        : "+r"(srcpix), "+r"(dstpix)    /* outputs */
        : "m"(templine),"m"(srcheight), "m"(width), "m"(yspace),
          "m"(yrecip), "m"(srcdiff64), "m"(dstdiff64), "m"(One64) /* input */
        /* "memory": the asm reads *srcpix and reads/writes *templine and
           *dstpix, none of which are listed as operands; "cc": flags change */
        : "%ecx", "%edx", "%rax", "memory", "cc" /* clobbered */
        );
}
/*
 * Shrink an image vertically with an area-averaging filter.
 *
 * Same algorithm and contract as filter_shrink_Y_MMX_gcc, but it uses
 * pshufw to broadcast a word and pmulhuw for the unsigned high multiply
 * instead of the longer plain-MMX sequences.
 *
 * srcpix     first source pixel; pixels are 4 bytes each
 * dstpix     first destination pixel; pixels are 4 bytes each
 * templine   caller-owned accumulator line; the asm reads and writes
 *            8 bytes (four 16-bit sums) per pixel, i.e. width * 8 bytes.
 *            It is added to before it is first stored, so it presumably
 *            has to be zeroed by the caller (the MSVC wrappers do that).
 * width      pixels per row
 * srcpitch   bytes from one source row to the next
 * dstpitch   bytes from one destination row to the next
 * srcheight  number of source rows
 * dstheight  number of destination rows (must be non-zero)
 *
 * NOTE(review): the asm decrements operand %3 (srcheight), which is
 * declared as an input-only "m" operand.  srcheight is not read again after
 * the asm, but strictly it should be a read-write operand.
 */
void
filter_shrink_Y_SSE_gcc(Uint8 *srcpix, Uint8 *dstpix, Uint16 *templine, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    int srcdiff = srcpitch - (width * 4);       /* padding bytes after each source row */
    int dstdiff = dstpitch - (width * 4);       /* padding bytes after each destination row */
    int yspace = 0x4000 * srcheight / dstheight; /* must be > 1 */
    int yrecip = 0x40000000 / yspace;
    long long One64 = 0x4000400040004000ULL;    /* 0x4000 in each of the four 16-bit lanes */
    /* 64-bit copies so that "add %7, %0" / "add %8, %1" add to 64-bit pointers */
    long long srcdiff64 = srcdiff;
    long long dstdiff64 = dstdiff;

    asm __volatile__(" /* MMX code for Y-shrink area average filter */ "
        " movl %5, %%ecx; "             /* ecx == ycounter */
        " pxor %%mm0, %%mm0; "          /* mm0 = 0, used to widen bytes to words */
        " movd %6, %%mm7; "             /* mm7 == yrecipmmx */
        " pshufw $0, %%mm7, %%mm7; "    /* broadcast the low word to all lanes */
        "1: "                           /* outer Y-loop */
        " mov %2, %%rax; "              /* rax == accumulate */
        " cmpl $0x4000, %%ecx; "
        " jbe 3f; "
        " movl %4, %%edx; "             /* edx == width */
        "2: "                           /* add a whole source row to the accumulator line */
        " movd (%0), %%mm1; "
        " add $4, %0; "
        " movq (%%rax), %%mm2; "
        " punpcklbw %%mm0, %%mm1; "
        " paddw %%mm1, %%mm2; "
        " movq %%mm2, (%%rax); "
        " add $8, %%rax; "
        " decl %%edx; "
        " jne 2b; "
        " subl $0x4000, %%ecx; "
        " jmp 6f; "
        "3: "                           /* prepare to output a line */
        " movd %%ecx, %%mm1; "
        " movl %4, %%edx; "             /* edx = width */
        " movq %9, %%mm6; "             /* mm6 = 2^14 */
        " pshufw $0, %%mm1, %%mm1; "
        " psubw %%mm1, %%mm6; "         /* mm6 = yfrac */
        "4: "
        " movd (%0), %%mm4; "           /* mm4 = srcpix */
        " add $4, %0; "
        " punpcklbw %%mm0, %%mm4; "
        " movq (%%rax), %%mm5; "        /* mm5 = accumulate */
        " movq %%mm6, %%mm3; "
        " psllw $2, %%mm4; "            /* <<2 so the >>16 below nets >>14 */
        " pmulhuw %%mm4, %%mm3; "       /* mm3 = (srcpix * yfrac) >> 16 */
        " pmulhuw %%mm1, %%mm4; "       /* mm4 = (srcpix * ycounter) >> 16 */
        " movq %%mm3, (%%rax); "        /* remainder becomes the new accumulator */
        " paddw %%mm5, %%mm4; "
        " add $8, %%rax; "
        " pmulhuw %%mm7, %%mm4; "       /* scale by yrecip */
        " packuswb %%mm0, %%mm4; "      /* words back to bytes, saturated */
        " movd %%mm4, (%1); "
        " add $4, %1; "
        " decl %%edx; "
        " jne 4b; "
        " add %8, %1; "                 /* dstpix += dstdiff */
        " addl %5, %%ecx; "             /* ycounter += yspace - 0x4000 */
        " subl $0x4000, %%ecx; "
        "6: "                           /* tail of outer Y-loop */
        " add %7, %0; "                 /* srcpix += srcdiff */
        " decl %3; "
        " jne 1b; "
        " emms; "
        : "+r"(srcpix), "+r"(dstpix)    /* outputs */
        : "m"(templine),"m"(srcheight), "m"(width), "m"(yspace),
          "m"(yrecip), "m"(srcdiff64), "m"(dstdiff64), "m"(One64) /* input */
        /* "memory": the asm reads *srcpix and reads/writes *templine and
           *dstpix, none of which are listed as operands; "cc": flags change */
        : "%ecx", "%edx", "%rax", "memory", "cc" /* clobbered */
        );
}
/* These functions implement a bilinear filter in the X-dimension.
*/
/*
 * Expand an image horizontally with a bilinear filter (MMX).
 *
 * srcpix    first source pixel; pixels are 4 bytes each
 * dstpix    first destination pixel; pixels are 4 bytes each
 * xidx0     caller-owned scratch, dstwidth ints: left source pixel index
 *           for every destination column (filled in here)
 * xmult0    caller-owned scratch, 2 * dstwidth ints: weight of the left
 *           source pixel, replicated into four 16-bit lanes (filled in here)
 * xmult1    caller-owned scratch, 2 * dstwidth ints: weight of the right
 *           source pixel, same layout (filled in here)
 * height    number of rows to process
 * srcpitch  bytes from one source row to the next
 * dstpitch  bytes from one destination row to the next
 * srcwidth  source pixels per row
 * dstwidth  destination pixels per row
 *
 * Weights are 8-bit fixed point (0x100 == 1.0) and the two weights of a
 * column always sum to 0x100.
 */
void
filter_expand_X_MMX_gcc(Uint8 *srcpix, Uint8 *dstpix, int *xidx0, int *xmult0, int *xmult1, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    int x, y;

    /* Create multiplier factors and starting indices and put them in arrays */
    for (x = 0; x < dstwidth; x++)
    {
        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
        int xm0 = 0x100 - xm1;
        xidx0[x] = x * (srcwidth - 1) / dstwidth;
        /* two ints per column == one 64-bit MMX load of four equal words */
        xmult1[x*2] = xm1 | (xm1 << 16);
        xmult1[x*2+1] = xm1 | (xm1 << 16);
        xmult0[x*2] = xm0 | (xm0 << 16);
        xmult0[x*2+1] = xm0 | (xm0 << 16);
    }

    /* Do the scaling in raster order so we don't trash the cache */
    for (y = 0; y < height; y++)
    {
        Uint8 *srcrow0 = srcpix + y * srcpitch;
        Uint8 *dstrow = dstpix + y * dstpitch;
        int *xm0 = xmult0;
        int *xm1 = xmult1;
        int *x0 = xidx0;
        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
            " movl %5, %%ecx; "             /* ecx = dstwidth, column counter */
            " pxor %%mm0, %%mm0; "          /* mm0 = 0, used to widen bytes to words */
            "1: "
            " movsxl (%3), %%rax; "         /* get xidx0[x] */
            " add $4, %3; "
            " movq (%0), %%mm1; "           /* load mult0 */
            " add $8, %0; "
            " movq (%1), %%mm2; "           /* load mult1 */
            " add $8, %1; "
            " movd (%4,%%rax,4), %%mm4; "   /* left source pixel */
            " movd 4(%4,%%rax,4), %%mm5; "  /* right source pixel */
            " punpcklbw %%mm0, %%mm4; "
            " punpcklbw %%mm0, %%mm5; "
            " pmullw %%mm1, %%mm4; "
            " pmullw %%mm2, %%mm5; "
            " paddw %%mm4, %%mm5; "
            " psrlw $8, %%mm5; "            /* drop the 8 fraction bits */
            " packuswb %%mm0, %%mm5; "      /* words back to bytes, saturated */
            " movd %%mm5, (%2); "
            " add $4, %2; "
            " decl %%ecx; "
            " jne 1b; "
            " emms; "
            : "+r"(xm0), "+r"(xm1), "+r"(dstrow), "+r"(x0) /* outputs */
            : "r"(srcrow0),"m"(dstwidth)    /* input */
            /* "memory": the asm reads the tables filled in by the C loop
               above plus the source row, and writes the destination row,
               none of which are listed as operands; without it the compiler
               may reorder those stores around the asm.  "cc": flags change */
            : "%ecx", "%rax", "memory", "cc" /* clobbered */
            );
    }
}
/*
 * Expand an image horizontally with a bilinear filter.
 *
 * The body is the same as filter_expand_X_MMX_gcc; the inner loop uses only
 * plain MMX instructions.
 *
 * srcpix    first source pixel; pixels are 4 bytes each
 * dstpix    first destination pixel; pixels are 4 bytes each
 * xidx0     caller-owned scratch, dstwidth ints: left source pixel index
 *           for every destination column (filled in here)
 * xmult0    caller-owned scratch, 2 * dstwidth ints: weight of the left
 *           source pixel, replicated into four 16-bit lanes (filled in here)
 * xmult1    caller-owned scratch, 2 * dstwidth ints: weight of the right
 *           source pixel, same layout (filled in here)
 * height    number of rows to process
 * srcpitch  bytes from one source row to the next
 * dstpitch  bytes from one destination row to the next
 * srcwidth  source pixels per row
 * dstwidth  destination pixels per row
 *
 * Weights are 8-bit fixed point (0x100 == 1.0) and the two weights of a
 * column always sum to 0x100.
 */
void
filter_expand_X_SSE_gcc(Uint8 *srcpix, Uint8 *dstpix, int *xidx0, int *xmult0, int *xmult1, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    int x, y;

    /* Create multiplier factors and starting indices and put them in arrays */
    for (x = 0; x < dstwidth; x++)
    {
        int xm1 = 0x100 * ((x * (srcwidth - 1)) % dstwidth) / dstwidth;
        int xm0 = 0x100 - xm1;
        xidx0[x] = x * (srcwidth - 1) / dstwidth;
        /* two ints per column == one 64-bit MMX load of four equal words */
        xmult1[x*2] = xm1 | (xm1 << 16);
        xmult1[x*2+1] = xm1 | (xm1 << 16);
        xmult0[x*2] = xm0 | (xm0 << 16);
        xmult0[x*2+1] = xm0 | (xm0 << 16);
    }

    /* Do the scaling in raster order so we don't trash the cache */
    for (y = 0; y < height; y++)
    {
        Uint8 *srcrow0 = srcpix + y * srcpitch;
        Uint8 *dstrow = dstpix + y * dstpitch;
        int *xm0 = xmult0;
        int *xm1 = xmult1;
        int *x0 = xidx0;
        asm __volatile__( " /* MMX code for inner loop of X bilinear filter */ "
            " movl %5, %%ecx; "             /* ecx = dstwidth, column counter */
            " pxor %%mm0, %%mm0; "          /* mm0 = 0, used to widen bytes to words */
            "1: "
            " movsxl (%3), %%rax; "         /* get xidx0[x] */
            " add $4, %3; "
            " movq (%0), %%mm1; "           /* load mult0 */
            " add $8, %0; "
            " movq (%1), %%mm2; "           /* load mult1 */
            " add $8, %1; "
            " movd (%4,%%rax,4), %%mm4; "   /* left source pixel */
            " movd 4(%4,%%rax,4), %%mm5; "  /* right source pixel */
            " punpcklbw %%mm0, %%mm4; "
            " punpcklbw %%mm0, %%mm5; "
            " pmullw %%mm1, %%mm4; "
            " pmullw %%mm2, %%mm5; "
            " paddw %%mm4, %%mm5; "
            " psrlw $8, %%mm5; "            /* drop the 8 fraction bits */
            " packuswb %%mm0, %%mm5; "      /* words back to bytes, saturated */
            " movd %%mm5, (%2); "
            " add $4, %2; "
            " decl %%ecx; "
            " jne 1b; "
            " emms; "
            : "+r"(xm0), "+r"(xm1), "+r"(dstrow), "+r"(x0) /* outputs */
            : "r"(srcrow0),"m"(dstwidth)    /* input */
            /* "memory": the asm reads the tables filled in by the C loop
               above plus the source row, and writes the destination row,
               none of which are listed as operands; without it the compiler
               may reorder those stores around the asm.  "cc": flags change */
            : "%ecx", "%rax", "memory", "cc" /* clobbered */
            );
    }
}
/* These functions implement a bilinear filter in the Y-dimension
*/
/*
 * Expand an image vertically with a bilinear filter (plain MMX).
 *
 * Every destination row is a weighted blend of two adjacent source rows.
 *
 * srcpix     first source pixel; pixels are 4 bytes each
 * dstpix     first destination pixel; pixels are 4 bytes each
 * width      pixels per row
 * srcpitch   bytes from one source row to the next
 * dstpitch   bytes from one destination row to the next
 * srcheight  number of source rows
 * dstheight  number of destination rows
 *
 * Weights are 8-bit fixed point (0x100 == 1.0).
 */
void
filter_expand_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    int y;

    for (y = 0; y < dstheight; y++)
    {
        int yidx0 = y * (srcheight - 1) / dstheight;    /* upper source row */
        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
        Uint8 *srcrow1 = srcrow0 + srcpitch;            /* the row below it */
        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
        int ymult0 = 0x0100 - ymult1;
        Uint8 *dstrow = dstpix + y * dstpitch;
        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
            " movl %5, %%ecx; "             /* ecx = width, column counter */
            " movd %3, %%mm1; "             /* mm1 = ymult0 */
            " movd %4, %%mm2; "             /* mm2 = ymult1 */
            " pxor %%mm0, %%mm0; "          /* mm0 = 0, used to widen bytes to words */
            " punpcklwd %%mm1, %%mm1; "     /* broadcast each weight to all lanes */
            " punpckldq %%mm1, %%mm1; "
            " punpcklwd %%mm2, %%mm2; "
            " punpckldq %%mm2, %%mm2; "
            "1: "
            " movd (%0), %%mm4; "           /* pixel from the upper row */
            " add $4, %0; "
            " movd (%1), %%mm5; "           /* pixel from the lower row */
            " add $4, %1; "
            " punpcklbw %%mm0, %%mm4; "
            " punpcklbw %%mm0, %%mm5; "
            " pmullw %%mm1, %%mm4; "
            " pmullw %%mm2, %%mm5; "
            " paddw %%mm4, %%mm5; "
            " psrlw $8, %%mm5; "            /* drop the 8 fraction bits */
            " packuswb %%mm0, %%mm5; "      /* words back to bytes, saturated */
            " movd %%mm5, (%2); "
            " add $4, %2; "
            " decl %%ecx; "
            " jne 1b; "
            " emms; "
            : "+r"(srcrow0), "+r"(srcrow1), "+r"(dstrow) /* outputs */
            : "m"(ymult0), "m"(ymult1), "m"(width)      /* input */
            /* "memory": the asm reads both source rows and writes the
               destination row, none of which are listed as operands;
               "cc": the flags are modified */
            : "%ecx", "memory", "cc" /* clobbered */
            );
    }
}
/*
 * Expand an image vertically with a bilinear filter.
 *
 * Same algorithm and contract as filter_expand_Y_MMX, but the two weights
 * are broadcast with pshufw instead of punpcklwd/punpckldq pairs.
 *
 * srcpix     first source pixel; pixels are 4 bytes each
 * dstpix     first destination pixel; pixels are 4 bytes each
 * width      pixels per row
 * srcpitch   bytes from one source row to the next
 * dstpitch   bytes from one destination row to the next
 * srcheight  number of source rows
 * dstheight  number of destination rows
 *
 * Weights are 8-bit fixed point (0x100 == 1.0).
 */
void
filter_expand_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    int y;

    for (y = 0; y < dstheight; y++)
    {
        int yidx0 = y * (srcheight - 1) / dstheight;    /* upper source row */
        Uint8 *srcrow0 = srcpix + yidx0 * srcpitch;
        Uint8 *srcrow1 = srcrow0 + srcpitch;            /* the row below it */
        int ymult1 = 0x0100 * ((y * (srcheight - 1)) % dstheight) / dstheight;
        int ymult0 = 0x0100 - ymult1;
        Uint8 *dstrow = dstpix + y * dstpitch;
        asm __volatile__( " /* MMX code for inner loop of Y bilinear filter */ "
            " movl %5, %%ecx; "             /* ecx = width, column counter */
            " movd %3, %%mm1; "             /* mm1 = ymult0 */
            " movd %4, %%mm2; "             /* mm2 = ymult1 */
            " pxor %%mm0, %%mm0; "          /* mm0 = 0, used to widen bytes to words */
            " pshufw $0, %%mm1, %%mm1; "    /* broadcast each weight to all lanes */
            " pshufw $0, %%mm2, %%mm2; "
            "1: "
            " movd (%0), %%mm4; "           /* pixel from the upper row */
            " add $4, %0; "
            " movd (%1), %%mm5; "           /* pixel from the lower row */
            " add $4, %1; "
            " punpcklbw %%mm0, %%mm4; "
            " punpcklbw %%mm0, %%mm5; "
            " pmullw %%mm1, %%mm4; "
            " pmullw %%mm2, %%mm5; "
            " paddw %%mm4, %%mm5; "
            " psrlw $8, %%mm5; "            /* drop the 8 fraction bits */
            " packuswb %%mm0, %%mm5; "      /* words back to bytes, saturated */
            " movd %%mm5, (%2); "
            " add $4, %2; "
            " decl %%ecx; "
            " jne 1b; "
            " emms; "
            : "+r"(srcrow0), "+r"(srcrow1), "+r"(dstrow) /* outputs */
            : "m"(ymult0), "m"(ymult1), "m"(width)      /* input */
            /* "memory": the asm reads both source rows and writes the
               destination row, none of which are listed as operands;
               "cc": the flags are modified */
            : "%ecx", "memory", "cc" /* clobbered */
            );
    }
}
/*
pygame - Python Game Library
Copyright (C) 2000-2001 Pete Shinners
Copyright (C) 2007 Rene Dudfield, Richard Goedeken
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Pete Shinners
pete@xxxxxxxxxxxx
*/
/* Pentium 64 bit SSE/MMX smoothscale routines
* These are written for compilation with MSVC only.
*
* This file should not depend on anything but the C standard library.
*/
#include <stdint.h>
typedef uint8_t Uint8; /* SDL convention */
typedef uint16_t Uint16; /* SDL convention */
#include <stdlib.h>
#include <memory.h>
#include "scale.h"
/* These functions implement an area-averaging shrinking filter in the Y-dimension.
*/
/*
 * MSVC-side entry point for the MMX Y-shrink filter.
 *
 * Allocates and zeroes an accumulator line of dstpitch * 2 bytes, hands it
 * to filter_shrink_Y_MMX_gcc together with the unchanged arguments, and
 * frees it again.  If the allocation fails nothing is written to dstpix.
 */
void
filter_shrink_Y_MMX(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    Uint16 *accline = (Uint16 *) malloc(dstpitch * 2);

    if (accline != 0)
    {
        /* the accumulator line has to start out cleared */
        memset(accline, 0, dstpitch * 2);
        filter_shrink_Y_MMX_gcc(srcpix, dstpix, accline, width, srcpitch, dstpitch, srcheight, dstheight);
        free(accline);
    }
}
/*
 * MSVC-side entry point for the SSE Y-shrink filter.
 *
 * Allocates and zeroes an accumulator line of dstpitch * 2 bytes, hands it
 * to filter_shrink_Y_SSE_gcc together with the unchanged arguments, and
 * frees it again.  If the allocation fails nothing is written to dstpix.
 */
void
filter_shrink_Y_SSE(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch, int dstpitch, int srcheight, int dstheight)
{
    Uint16 *accline = (Uint16 *) malloc(dstpitch * 2);

    if (accline != 0)
    {
        /* the accumulator line has to start out cleared */
        memset(accline, 0, dstpitch * 2);
        filter_shrink_Y_SSE_gcc(srcpix, dstpix, accline, width, srcpitch, dstpitch, srcheight, dstheight);
        free(accline);
    }
}
/* These functions implement a bilinear filter in the X-dimension.
*/
/*
 * MSVC-side entry point for the MMX X-expand (bilinear) filter.
 *
 * Allocates the three scratch tables that filter_expand_X_MMX_gcc fills in
 * and uses (dstwidth index entries plus two tables of dstwidth * 8 bytes of
 * multipliers), calls it with the unchanged arguments, and frees the tables.
 * If any allocation fails the function returns without touching dstpix.
 */
void
filter_expand_X_MMX(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    int *xidx0, *xmult0, *xmult1;
    int factorwidth = 8;    /* bytes of multiplier data per destination pixel */

    /* Allocate memory for factors */
    xidx0 = malloc(dstwidth * 4);
    if (xidx0 == 0) return;
    xmult0 = (int *) malloc(dstwidth * factorwidth);
    xmult1 = (int *) malloc(dstwidth * factorwidth);
    if (xmult0 == 0 || xmult1 == 0)
    {
        /* free(NULL) is a no-op, so no need to test each pointer */
        free(xidx0);
        free(xmult0);
        free(xmult1);
        /* bail out: without this the freed buffers were used and freed again */
        return;
    }

    filter_expand_X_MMX_gcc(srcpix, dstpix, xidx0, xmult0, xmult1, height, srcpitch, dstpitch, srcwidth, dstwidth);

    /* free memory */
    free(xidx0);
    free(xmult0);
    free(xmult1);
}
/*
 * MSVC-side entry point for the SSE X-expand (bilinear) filter.
 *
 * Allocates the three scratch tables that filter_expand_X_SSE_gcc fills in
 * and uses (dstwidth index entries plus two tables of dstwidth * 8 bytes of
 * multipliers), calls it with the unchanged arguments, and frees the tables.
 * If any allocation fails the function returns without touching dstpix.
 */
void
filter_expand_X_SSE(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch, int dstpitch, int srcwidth, int dstwidth)
{
    int *xidx0, *xmult0, *xmult1;
    int factorwidth = 8;    /* bytes of multiplier data per destination pixel */

    /* Allocate memory for factors */
    xidx0 = malloc(dstwidth * 4);
    if (xidx0 == 0) return;
    xmult0 = (int *) malloc(dstwidth * factorwidth);
    xmult1 = (int *) malloc(dstwidth * factorwidth);
    if (xmult0 == 0 || xmult1 == 0)
    {
        /* free(NULL) is a no-op, so no need to test each pointer */
        free(xidx0);
        free(xmult0);
        free(xmult1);
        /* bail out: without this the freed buffers were used and freed again */
        return;
    }

    filter_expand_X_SSE_gcc(srcpix, dstpix, xidx0, xmult0, xmult1, height, srcpitch, dstpitch, srcwidth, dstwidth);

    /* free memory */
    free(xidx0);
    free(xmult0);
    free(xmult1);
}