libmpeg2: add an SSE2 motion-compensation backend.

 - mpeg2.h, mpeg2_internal.h: declare MPEG2_ACCEL_X86_SSE2 and mpeg2_mc_sse2.
 - cpu_accel.c: arch_accel() drops its own CPUID probing and derives the
   acceleration mask from the gCpuCaps flags declared in cpudetect.h.
 - motion_comp_sse2.c: new file, put/avg motion compensation (o, x, y, xy
   variants, 16 and 8 pixels wide) written with SSE2 intrinsics.
 - motion_comp.c: mpeg2_mc_init() selects mpeg2_mc_sse2 when SSE2 is usable.
 - Makefile: build motion_comp_sse2.c when TARGET_SSE is "yes".

Index: libmpeg2/mpeg2_internal.h
===================================================================
--- libmpeg2/mpeg2_internal.h	(revision 18701)
+++ libmpeg2/mpeg2_internal.h	(working copy)
@@ -309,6 +309,7 @@ extern mpeg2_mc_t mpeg2_mc_c;
 extern mpeg2_mc_t mpeg2_mc_mmx;
 extern mpeg2_mc_t mpeg2_mc_mmxext;
 extern mpeg2_mc_t mpeg2_mc_3dnow;
+extern mpeg2_mc_t mpeg2_mc_sse2;
 extern mpeg2_mc_t mpeg2_mc_altivec;
 extern mpeg2_mc_t mpeg2_mc_alpha;
 extern mpeg2_mc_t mpeg2_mc_vis;
Index: libmpeg2/cpu_accel.c
===================================================================
--- libmpeg2/cpu_accel.c	(revision 18701)
+++ libmpeg2/cpu_accel.c	(working copy)
@@ -26,6 +26,7 @@
  */
 
 #include "config.h"
+#include "cpudetect.h"
 
 #include <inttypes.h>
 
@@ -37,78 +38,22 @@
 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 static inline uint32_t arch_accel (void)
 {
-    uint32_t eax, ebx, ecx, edx;
-    int AMD;
-    uint32_t caps;
-
-#if !defined(PIC) && !defined(__PIC__)
-#define cpuid(op,eax,ebx,ecx,edx) \
-    __asm__ ("cpuid" \
-             : "=a" (eax), \
-               "=b" (ebx), \
-               "=c" (ecx), \
-               "=d" (edx) \
-             : "a" (op) \
-             : "cc")
-#else /* PIC version : save ebx */
-#define cpuid(op,eax,ebx,ecx,edx) \
-    __asm__ ("push %%ebx\n\t" \
-             "cpuid\n\t" \
-             "movl %%ebx,%1\n\t" \
-             "pop %%ebx" \
-             : "=a" (eax), \
-               "=r" (ebx), \
-               "=c" (ecx), \
-               "=d" (edx) \
-             : "a" (op) \
-             : "cc")
-#endif
-
-    __asm__ ("pushf\n\t"
-             "pushf\n\t"
-             "pop %0\n\t"
-             "movl %0,%1\n\t"
-             "xorl $0x200000,%0\n\t"
-             "push %0\n\t"
-             "popf\n\t"
-             "pushf\n\t"
-             "pop %0\n\t"
-             "popf"
-             : "=r" (eax),
-               "=r" (ebx)
-             :
-             : "cc");
-
-    if (eax == ebx) /* no cpuid */
-        return 0;
-
-    cpuid (0x00000000, eax, ebx, ecx, edx);
-    if (!eax) /* vendor string only */
-        return 0;
-
-    AMD = (ebx == 0x68747541) && (ecx == 0x444d4163) && (edx == 0x69746e65);
-
-    cpuid (0x00000001, eax, ebx, ecx, edx);
-    if (! (edx & 0x00800000)) /* no MMX */
-        return 0;
-
-    caps = MPEG2_ACCEL_X86_MMX;
-    if (edx & 0x02000000) /* SSE - identical to AMD MMX extensions */
-        caps = MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT;
-
-    cpuid (0x80000000, eax, ebx, ecx, edx);
-    if (eax < 0x80000001) /* no extended capabilities */
-        return caps;
-
-    cpuid (0x80000001, eax, ebx, ecx, edx);
-
-    if (edx & 0x80000000)
-        caps |= MPEG2_ACCEL_X86_3DNOW;
-
-    if (AMD && (edx & 0x00400000)) /* AMD MMX extensions */
-        caps |= MPEG2_ACCEL_X86_MMXEXT;
-
-    return caps;
+#if defined(HAVE_SSE2)
+    if (gCpuCaps.hasSSE2) {
+        return MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT | MPEG2_ACCEL_X86_SSE2;
+    }
+#endif
+#if defined(HAVE_MMX2) || defined(HAVE_SSE)
+    if (gCpuCaps.hasSSE || gCpuCaps.hasMMX2) {
+        return MPEG2_ACCEL_X86_MMX | MPEG2_ACCEL_X86_MMXEXT;
+    }
+#endif
+#if defined(HAVE_MMX)
+    if (gCpuCaps.hasMMX || gCpuCaps.has3DNow || gCpuCaps.has3DNowExt) {
+        return MPEG2_ACCEL_X86_MMX;
+    }
+#endif
+    return 0;
 }
 #endif /* ARCH_X86 || ARCH_X86_64 */
 
Index: libmpeg2/motion_comp_sse2.c
===================================================================
--- libmpeg2/motion_comp_sse2.c	(revision 0)
+++ libmpeg2/motion_comp_sse2.c	(revision 0)
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2006
+ * Jim Huang
+ *
+ * This Program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This Program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with GNU Make; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * Based on Intel's AP-942
+ */
+
+#include "config.h"
+
+#if defined(HAVE_BUILTIN_VECTOR)
+
+#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#ifdef HAVE_SSE2
+
+#include <inttypes.h>
+#include "mpeg2.h"
+#include "attributes.h"
+#include "mpeg2_internal.h"
+
+#ifdef __GNUC__
+  #ifndef __forceinline
+    #define __forceinline __attribute__((__always_inline__)) inline
+  #endif
+#endif
+
+#ifdef __GNUC__
+  #define __inline __forceinline // GCC needs to force inlining of intrinsics functions
+#endif
+
+#include <mmintrin.h>
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#ifdef __GNUC__
+  #undef __inline
+#endif
+
+#ifdef __GNUC__
+  #define __align8(t,v) t v __attribute__ ((aligned (8)))
+  #define __align16(t,v) t v __attribute__ ((aligned (16)))
+#else
+  #define __align8(t,v) __declspec(align(8)) t v
+  #define __align16(t,v) __declspec(align(16)) t v
+#endif
+
+static __m128i const_1_16_bytes;
+static void __attribute__((constructor)) mpeg2_MC_sse_ctor()
+{
+    const_1_16_bytes = _mm_set1_epi8(1); /* 0x01 in each of the 16 bytes; operand of the byte-wise _mm_subs_epu8 */
+}
+
+static void MC_put_o_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+    const int edi = eax + eax;
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        __m128i xmm0, xmm1;
+        xmm0 = _mm_loadu_si128((__m128i*) edx);
+        xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+        _mm_store_si128((__m128i*) ecx, xmm0);
+        _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+    }
+}
+
+static void MC_put_o_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+    const int edi = eax + eax;
+    for (; esi; edx += edi, ecx += edi, esi-= 2) {
+        __m128d xmm0;
+        xmm0 = _mm_loadl_pd(xmm0, (double*) edx);
+        xmm0 = _mm_loadh_pd(xmm0, (double*) (edx + eax));
+        _mm_storel_pd((double*) ecx, xmm0);
+        _mm_storeh_pd((double*) (ecx + eax), xmm0);
+    }
+}
+
+static void MC_put_x_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+    const int edi= eax + eax;
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        __m128i xmm0, xmm1, xmm2, xmm3;
+        xmm0 = _mm_loadu_si128((__m128i*) edx);
+        xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+        xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+        xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        xmm2 = _mm_avg_epu8(xmm2, xmm3);
+        _mm_store_si128((__m128i*) ecx, xmm0);
+        _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+    }
+}
+
+static void MC_put_x_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+    const int edi = eax + eax;
+    __m128i xmm0,xmm1;
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+        xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + 1)));
+        xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+        xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax + 1)));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+        _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+    }
+}
+
+static void MC_put_y_16_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+    const int edi= eax + eax;
+    __m128i xmm0;
+    xmm0 = _mm_loadu_si128((__m128i*) edx);
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        __m128i xmm1,xmm2;
+        xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+        xmm2 = _mm_loadu_si128((__m128i*) (edx + edi));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        xmm1 = _mm_avg_epu8(xmm1, xmm2);
+        _mm_store_si128((__m128i*) ecx, xmm0);
+        xmm0 = xmm2;
+        _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+    }
+}
+
+static void MC_put_y_8_sse2(uint8_t* ecx, const uint8_t* edx, const int eax, int esi)
+{
+    const int edi = eax + eax;
+    __m128i xmm0;
+    xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx)); /* high half = row 0 */
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        __m128i xmm1;
+        xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax))); /* low half = row 1 */
+        xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+        xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+        _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+        xmm0 = _mm_shuffle_epi32(xmm1, 0x4e); /* swap halves: row 2 becomes row 0 of the next pair */
+    }
+}
+
+static void MC_put_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+    const uint8_t *edx = ref;
+    uint8_t *ecx = dest;
+    int eax = stride;
+    int esi = height;
+    int edi = eax + eax;
+    __m128i xmm7, xmm0, xmm1, xmm4, xmm5, xmm2, xmm3;
+    xmm7 = const_1_16_bytes;
+    xmm0 = _mm_loadu_si128((__m128i*) edx);
+    xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+        xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+        xmm4 = _mm_loadu_si128((__m128i*) (edx + edi));
+        xmm5 = _mm_loadu_si128((__m128i*) (edx + edi + 1));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        xmm2 = _mm_avg_epu8(xmm2, xmm3);
+        xmm1 = xmm5;
+        xmm5 = _mm_avg_epu8(xmm5, xmm4);
+        xmm2 = _mm_subs_epu8(xmm2, xmm7);
+        xmm0 = _mm_avg_epu8(xmm0, xmm2);
+        xmm2 = _mm_avg_epu8(xmm2, xmm5);
+        _mm_store_si128((__m128i*) ecx, xmm0);
+        xmm0 = xmm4;
+        _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+    }
+}
+
+static void MC_put_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+    const uint8_t *edx = ref;
+    uint8_t *ecx = dest;
+    int eax = stride;
+    int esi = height;
+    int edi = eax + eax;
+    __m128i xmm7, xmm0, xmm2, xmm1, xmm3;
+    xmm7 = const_1_16_bytes;
+    xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx)); /* high half = row 0 */
+    xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (edx + 1)));
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax))); /* low half = row 1 */
+        xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (edx + eax + 1)));
+        xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+        xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+        xmm3 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm3), (double*) (edx + eax + 1)));
+        xmm3 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm3), (double*) (edx + edi + 1)));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        xmm2 = _mm_avg_epu8(xmm2, xmm3);
+        xmm0 = _mm_subs_epu8(xmm0, xmm7);
+        xmm0 = _mm_avg_epu8(xmm0, xmm2); /* combine column x with column x+1 */
+        _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+        _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+        xmm0 = _mm_shuffle_epi32(xmm1, 0x4e); /* swap halves: row 2 becomes row 0 of the next pair */
+        xmm2 = _mm_shuffle_epi32(xmm3, 0x4e);
+    }
+}
+
+static void MC_avg_o_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+    const uint8_t *edx = ref;
+    uint8_t *ecx = dest;
+    int esi = height;
+    int eax = stride;
+    int edi = eax + eax;
+    for (; esi; edx += edi, ecx += edi,esi -= 2) {
+        __m128i xmm0, xmm1, xmm2, xmm3;
+        xmm0 = _mm_loadu_si128((__m128i*) edx);
+        xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+        xmm2 = _mm_load_si128((__m128i*) ecx);
+        xmm3 = _mm_load_si128((__m128i*) (ecx + eax));
+        xmm0 = _mm_avg_epu8(xmm0, xmm2);
+        xmm1 = _mm_avg_epu8(xmm1, xmm3);
+        _mm_store_si128((__m128i*) ecx, xmm0);
+        _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+    }
+}
+
+static void MC_avg_o_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+    const uint8_t *edx = ref;
+    uint8_t *ecx = dest;
+    int esi = height;
+    int eax = stride;
+    int edi = eax + eax;
+    __m128i xmm0, xmm1;
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+        xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+        xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) ecx));
+        xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (ecx + eax))); /* second destination row */
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+        _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+    }
+}
+
+static void MC_avg_x_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+    const uint8_t *edx = ref;
+    uint8_t *ecx = dest;
+    int esi = height;
+    int eax = stride;
+    int edi = eax + eax;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        xmm0 = _mm_loadu_si128((__m128i*) edx);
+        xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+        xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+        xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        xmm2 = _mm_avg_epu8(xmm2, xmm3);
+        xmm4 = _mm_load_si128((__m128i*) ecx);
+        xmm5 = _mm_load_si128((__m128i*) (ecx + eax));
+        xmm0 = _mm_avg_epu8(xmm0, xmm4);
+        xmm2 = _mm_avg_epu8(xmm2, xmm5);
+        _mm_store_si128((__m128i*) ecx, xmm0);
+        _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+    }
+}
+
+static void MC_avg_x_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+    const uint8_t *edx = ref;
+    uint8_t *ecx = dest;
+    int esi = height;
+    int eax = stride;
+    int edi = eax + eax;
+    __m128i xmm0, xmm1, xmm2;
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) edx));
+        xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + 1)));
+        xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax)));
+        xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax + 1)));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+        xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+        xmm0 = _mm_avg_epu8(xmm0, xmm2);
+        _mm_storel_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+        _mm_storeh_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+    }
+}
+
+static void MC_avg_y_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+    const uint8_t *edx = ref;
+    uint8_t *ecx = dest;
+    int esi = height;
+    int eax = stride;
+    int edi = eax + eax;
+    __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
+
+    xmm0 = _mm_loadu_si128((__m128i*) edx);
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        xmm1 = _mm_loadu_si128((__m128i*) (edx + eax));
+        xmm2 = _mm_loadu_si128((__m128i*) (edx + edi));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        xmm1 = _mm_avg_epu8(xmm1, xmm2);
+        xmm3 = _mm_load_si128((__m128i*) ecx);
+        xmm4 = _mm_load_si128((__m128i*) (ecx + eax));
+        xmm0 = _mm_avg_epu8(xmm0, xmm3);
+        xmm1 = _mm_avg_epu8(xmm1, xmm4);
+        _mm_store_si128((__m128i*) ecx, xmm0);
+        xmm0 = xmm2;
+        _mm_store_si128((__m128i*) (ecx + eax), xmm1);
+    }
+}
+
+static void MC_avg_y_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+    const uint8_t *edx = ref;
+    uint8_t *ecx = dest;
+    int esi = height;
+    int eax = stride;
+    int edi = eax + eax;
+    __m128i xmm0, xmm1, xmm2;
+    xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx)); /* high half = row 0 */
+    for (; esi; edx += edi, ecx += edi, esi -= 2) {
+        xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax))); /* low half = row 1 */
+        xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+        xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) ecx));
+        xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (ecx + eax)));
+        xmm0 = _mm_avg_epu8(xmm0, xmm2);
+        _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+        _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+        xmm0 = _mm_shuffle_epi32(xmm1, 0x4e); /* swap halves: row 2 becomes row 0 of the next pair */
+    }
+}
+
+static void MC_avg_xy_16_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+    const uint8_t *edx = ref;
+    uint8_t *ecx = dest;
+    int esi = height;
+    int eax = stride;
+    int edi = eax + eax;
+    __m128i xmm7, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
+    xmm7 = const_1_16_bytes;
+    xmm0 = _mm_loadu_si128((__m128i*) edx);
+    xmm1 = _mm_loadu_si128((__m128i*) (edx + 1));
+    for (; esi; edx += edi, ecx += edi, esi-= 2) {
+        xmm2 = _mm_loadu_si128((__m128i*) (edx + eax));
+        xmm3 = _mm_loadu_si128((__m128i*) (edx + eax + 1));
+        xmm4 = _mm_loadu_si128((__m128i*) (edx + edi));
+        xmm5 = _mm_loadu_si128((__m128i*) (edx + edi + 1));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        xmm2 = _mm_avg_epu8(xmm2, xmm3);
+        xmm1 = xmm5;
+        xmm5 = _mm_avg_epu8(xmm5, xmm4);
+        xmm2 = _mm_subs_epu8(xmm2, xmm7);
+        xmm0 = _mm_avg_epu8(xmm0, xmm2);
+        xmm2 = _mm_avg_epu8(xmm2, xmm5);
+        xmm5 = _mm_load_si128((__m128i*) ecx);
+        xmm6 = _mm_load_si128((__m128i*) (ecx + eax));
+        xmm0 = _mm_avg_epu8(xmm0, xmm5);
+        xmm2 = _mm_avg_epu8(xmm2, xmm6);
+        _mm_store_si128((__m128i*) ecx, xmm0);
+        xmm0 = xmm4;
+        _mm_store_si128((__m128i*) (ecx + eax), xmm2);
+    }
+}
+
+static void MC_avg_xy_8_sse2(uint8_t* dest, const uint8_t* ref, const int stride, int height)
+{
+    const uint8_t *edx = ref;
+    uint8_t *ecx = dest;
+    int esi = height;
+    int eax = stride;
+    int edi = eax + eax;
+    __m128i xmm7, xmm0, xmm2, xmm1, xmm3, xmm4;
+    xmm7 = const_1_16_bytes;
+    xmm0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm0), (double*) edx)); /* high half = row 0 */
+    xmm2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm2), (double*) (edx + 1)));
+    for (;esi;edx+=edi,ecx+=edi, esi -= 2) {
+        xmm0 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm0), (double*) (edx + eax))); /* low half = row 1 */
+        xmm2 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm2), (double*) (edx + eax + 1)));
+        xmm1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm1), (double*) (edx + eax)));
+        xmm1 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm1), (double*) (edx + edi)));
+        xmm3 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm3), (double*) (edx + eax + 1)));
+        xmm3 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm3), (double*) (edx + edi + 1)));
+        xmm0 = _mm_avg_epu8(xmm0, xmm1);
+        xmm2 = _mm_avg_epu8(xmm2, xmm3);
+        xmm0 = _mm_subs_epu8(xmm0, xmm7);
+        xmm0 = _mm_avg_epu8(xmm0, xmm2);
+        xmm4 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(xmm4), (double*) ecx));
+        xmm4 = _mm_castpd_si128(_mm_loadl_pd(_mm_castsi128_pd(xmm4), (double*) (ecx + eax)));
+        xmm0 = _mm_avg_epu8(xmm0, xmm4);
+        _mm_storeh_pd((double*) ecx, _mm_castsi128_pd(xmm0));
+        _mm_storel_pd((double*) (ecx + eax), _mm_castsi128_pd(xmm0));
+        xmm0 = _mm_shuffle_epi32(xmm1, 0x4e); /* swap halves: row 2 becomes row 0 of the next pair */
+        xmm2 = _mm_shuffle_epi32(xmm3, 0x4e);
+    }
+}
+
+MPEG2_MC_EXTERN (sse2)
+
+#endif
+#endif
+#endif
Index: libmpeg2/mpeg2.h
===================================================================
--- libmpeg2/mpeg2.h	(revision 18701)
+++ libmpeg2/mpeg2.h	(working copy)
@@ -159,6 +159,7 @@ void mpeg2_custom_fbuf (mpeg2dec_t * mpe
 #define MPEG2_ACCEL_X86_MMX 1
 #define MPEG2_ACCEL_X86_3DNOW 2
 #define MPEG2_ACCEL_X86_MMXEXT 4
+#define MPEG2_ACCEL_X86_SSE2 8
 #define MPEG2_ACCEL_PPC_ALTIVEC 1
 #define MPEG2_ACCEL_ALPHA 1
 #define MPEG2_ACCEL_ALPHA_MVI 2
Index: libmpeg2/motion_comp.c
===================================================================
--- libmpeg2/motion_comp.c	(revision 18701)
+++ libmpeg2/motion_comp.c	(working copy)
@@ -26,6 +26,7 @@
  */
 
 #include "config.h"
+#include "cpudetect.h"
 
 #include <inttypes.h>
 
@@ -38,6 +39,14 @@ mpeg2_mc_t mpeg2_mc;
 void mpeg2_mc_init (uint32_t accel)
 {
 #if defined(ARCH_X86) || defined(ARCH_X86_64)
+#if defined(HAVE_SSE2) && defined(HAVE_BUILTIN_VECTOR)
+    /* SSE2-optimized MC depends on MMX intrinsics. */
+    if (gCpuCaps.hasSSE2 || (accel & MPEG2_ACCEL_X86_SSE2)) {
+        mpeg2_mc = mpeg2_mc_sse2;
+        return;
+    }
+    else
+#endif
     if (accel & MPEG2_ACCEL_X86_MMXEXT)
         mpeg2_mc = mpeg2_mc_mmxext;
     else if (accel & MPEG2_ACCEL_X86_3DNOW)
Index: libmpeg2/Makefile
===================================================================
--- libmpeg2/Makefile	(revision 18701)
+++ libmpeg2/Makefile	(working copy)
@@ -24,6 +24,10 @@ ifeq ($(TARGET_ARCH_X86_64),yes)
 SRCS += idct_mmx.c motion_comp_mmx.c
 endif
 
+ifeq ($(TARGET_SSE),yes)
+SRCS += motion_comp_sse2.c
+endif
+
 ifeq ($(TARGET_ALTIVEC),yes)
 SRCS += motion_comp_altivec.c idct_altivec.c
 endif