/*


* MMX optimized DSP utils

* Copyright (c) 2000, 2001 Fabrice Bellard

* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>

*

* This file is part of FFmpeg.

*

* FFmpeg is free software; you can redistribute it and/or

* modify it under the terms of the GNU Lesser General Public

* License as published by the Free Software Foundation; either

* version 2.1 of the License, or (at your option) any later version.

*

* FFmpeg is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

* Lesser General Public License for more details.

*

* You should have received a copy of the GNU Lesser General Public

* License along with FFmpeg; if not, write to the Free Software

* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

*

* MMX optimization by Nick Kurshev <nickols_k@mail.ru>

*/

#include "libavutil/x86_cpu.h" 
#include "libavcodec/dsputil.h" 
#include "libavcodec/mpegvideo.h" 
#include "libavcodec/mathops.h" 
#include "dsputil_mmx.h" 
/**
 * Expand an 8x8 block of unsigned bytes into 16-bit values (MMX).
 *
 * Each loop iteration reads two rows of 8 bytes (pixels and
 * pixels + line_size), zero-extends them to words and stores the resulting
 * 32 bytes. The byte offset in REG_a runs from -128 up to 0 in steps of 32,
 * i.e. four iterations, eight rows and 128 output bytes.
 *
 * @param block     destination; addressed relative to block+64, which is the
 *                  end of the 128-byte output area when DCTELEM is 16 bits
 *                  wide (assumed, the typedef is not visible here)
 * @param pixels    first source pixel
 * @param line_size distance in bytes between two source rows
 */
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        /* negative byte offset from block+64; the loop ends when it reaches 0 */
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t" /* mm7 = 0, used for zero extension */
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t" /* row n   */
        "movq (%0, %2), %%mm2           \n\t" /* row n+1 */
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t" /* pixels += 2*line_size */
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t" /* loop while the offset is still negative */
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}
/**
 * Expand an 8x8 block of unsigned bytes into 16-bit values (SSE2).
 *
 * Fully unrolled: two groups of four rows. Every row is loaded as 8 bytes,
 * zero-extended to eight words and written with movdqa to consecutive
 * 16-byte slots at block (offsets 0..112), so block must be 16-byte aligned.
 *
 * @param block     destination, 128 bytes are written
 * @param pixels    first source pixel
 * @param line_size distance in bytes between two source rows
 */
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size) 
{ 
__asm__ volatile(

/* xmm7 = 0, used for zero extension */
"pxor %%xmm7, %%xmm7 \n\t"

/* rows 0..3; %2 = line_size, %3 = 3*line_size */
"movq (%0), %%xmm0 \n\t"

"movq (%0, %2), %%xmm1 \n\t"

"movq (%0, %2,2), %%xmm2 \n\t"

"movq (%0, %3), %%xmm3 \n\t"

/* pixels += 4*line_size */
"lea (%0,%2,4), %0 \n\t"

"punpcklbw %%xmm7, %%xmm0 \n\t"

"punpcklbw %%xmm7, %%xmm1 \n\t"

"punpcklbw %%xmm7, %%xmm2 \n\t"

"punpcklbw %%xmm7, %%xmm3 \n\t"

"movdqa %%xmm0, (%1) \n\t"

"movdqa %%xmm1, 16(%1) \n\t"

"movdqa %%xmm2, 32(%1) \n\t"

"movdqa %%xmm3, 48(%1) \n\t"

/* rows 4..7 */
"movq (%0), %%xmm0 \n\t"

"movq (%0, %2), %%xmm1 \n\t"

"movq (%0, %2,2), %%xmm2 \n\t"

"movq (%0, %3), %%xmm3 \n\t"

"punpcklbw %%xmm7, %%xmm0 \n\t"

"punpcklbw %%xmm7, %%xmm1 \n\t"

"punpcklbw %%xmm7, %%xmm2 \n\t"

"punpcklbw %%xmm7, %%xmm3 \n\t"

"movdqa %%xmm0, 64(%1) \n\t"

"movdqa %%xmm1, 80(%1) \n\t"

"movdqa %%xmm2, 96(%1) \n\t"

"movdqa %%xmm3, 112(%1) \n\t"

: "+r" (pixels)

: "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3) 
); 
} 
/**
 * Store the per-pixel difference s1 - s2 of two 8x8 byte blocks as 16-bit
 * values (MMX).
 *
 * One row of 8 pixels is handled per iteration: both rows are zero-extended
 * to words, subtracted and written as 16 bytes. The byte offset in REG_a runs
 * from -128 up to 0 in steps of 16, i.e. eight iterations.
 *
 * @param block  destination; addressed relative to block+64 (the end of the
 *               128-byte output area when DCTELEM is 16 bits wide - assumed,
 *               the typedef is not visible here)
 * @param s1     first source block (minuend)
 * @param s2     second source block (subtrahend)
 * @param stride distance in bytes between two rows, used for both sources
 */
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7              \n\t" /* mm7 = 0, used for zero extension */
        /* negative byte offset from block+64; the loop ends when it reaches 0 */
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t" /* s1 - s2, pixels 0..3 */
        "psubw %%mm3, %%mm1             \n\t" /* s1 - s2, pixels 4..7 */
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}
static int pix_sum16_mmx(uint8_t * pix, int line_size){ 
const int h=16; 
int sum;

x86_reg index= line_size*h; 
__asm__ volatile(

"pxor %%mm7, %%mm7 \n\t"

"pxor %%mm6, %%mm6 \n\t"

"1: \n\t"

"movq (%2, %1), %%mm0 \n\t"

"movq (%2, %1), %%mm1 \n\t"

"movq 8(%2, %1), %%mm2 \n\t"

"movq 8(%2, %1), %%mm3 \n\t"

"punpcklbw %%mm7, %%mm0 \n\t"

"punpckhbw %%mm7, %%mm1 \n\t"

"punpcklbw %%mm7, %%mm2 \n\t"

"punpckhbw %%mm7, %%mm3 \n\t"

"paddw %%mm0, %%mm1 \n\t"

"paddw %%mm2, %%mm3 \n\t"

"paddw %%mm1, %%mm3 \n\t"

"paddw %%mm3, %%mm6 \n\t"

"add %3, %1 \n\t"

" js 1b \n\t"

"movq %%mm6, %%mm5 \n\t"

"psrlq $32, %%mm6 \n\t"

"paddw %%mm5, %%mm6 \n\t"

"movq %%mm6, %%mm5 \n\t"

"psrlq $16, %%mm6 \n\t"

"paddw %%mm5, %%mm6 \n\t"

"movd %%mm6, %0 \n\t"

"andl $0xFFFF, %0 \n\t"

: "=&r" (sum), "+r" (index) 
: "r" (pix  index), "r" ((x86_reg)line_size) 
); 
return sum;

} 
static int pix_norm1_mmx(uint8_t *pix, int line_size) { 
int tmp;

__asm__ volatile (

"movl $16,%%ecx\n"

"pxor %%mm0,%%mm0\n"

"pxor %%mm7,%%mm7\n"

"1:\n"

"movq (%0),%%mm2\n" /* mm2 = pix[07] */ 
"movq 8(%0),%%mm3\n" /* mm3 = pix[815] */ 
"movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[07] */ 
173 
"punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix47] */ 
"punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix03] */ 
"movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[815] */ 
"punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix1215] */ 
"punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix811] */ 
"pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ 
"pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ 
"pmaddwd %%mm3,%%mm3\n"

"pmaddwd %%mm4,%%mm4\n"

"paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, 
pix2^2+pix3^2+pix6^2+pix7^2) */

"paddd %%mm3,%%mm4\n"

"paddd %%mm2,%%mm7\n"

"add %2, %0\n"

"paddd %%mm4,%%mm7\n"

"dec %%ecx\n"

"jnz 1b\n"

"movq %%mm7,%%mm1\n"

"psrlq $32, %%mm7\n" /* shift hi dword to lo */ 
"paddd %%mm7,%%mm1\n"

"movd %%mm1,%1\n"

: "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" ); 
return tmp;

} 
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
int tmp;

__asm__ volatile (

"movl %4,%%ecx\n"

"shr $1,%%ecx\n"

"pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 
"1:\n"

"movq (%0),%%mm1\n" /* mm1 = pix1[0][07] */ 
"movq (%1),%%mm2\n" /* mm2 = pix2[0][07] */ 
"movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][07] */ 
"movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][07] */ 
/* todo: mm1mm2, mm3mm4 */

/* algo: subtract mm1 from mm2 with saturation and vice versa */

/* OR the results to get absolute difference */

"movq %%mm1,%%mm5\n"

"movq %%mm3,%%mm6\n"

"psubusb %%mm2,%%mm1\n"

"psubusb %%mm4,%%mm3\n"

"psubusb %%mm5,%%mm2\n"

"psubusb %%mm6,%%mm4\n"

"por %%mm1,%%mm2\n"

"por %%mm3,%%mm4\n"

/* now convert to 16bit vectors so we can square them */

"movq %%mm2,%%mm1\n"

"movq %%mm4,%%mm3\n"

"punpckhbw %%mm0,%%mm2\n"

"punpckhbw %%mm0,%%mm4\n"

"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 
"pmaddwd %%mm2,%%mm2\n"

"pmaddwd %%mm4,%%mm4\n"

"pmaddwd %%mm1,%%mm1\n"

"pmaddwd %%mm3,%%mm3\n"

"lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ 
"lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ 
"paddd %%mm2,%%mm1\n"

"paddd %%mm4,%%mm3\n"

"paddd %%mm1,%%mm7\n"

"paddd %%mm3,%%mm7\n"

252 
253 
254  
"movq %%mm7,%%mm1\n"

"psrlq $32, %%mm7\n" /* shift hi dword to lo */ 
"paddd %%mm7,%%mm1\n"

"movd %%mm1,%2\n"

: "+r" (pix1), "+r" (pix2), "=r"(tmp) 
: "r" ((x86_reg)line_size) , "m" (h) 
: "%ecx");

return tmp;

} 
/**
 * Sum of squared differences between two 16-pixel-wide blocks (MMX).
 *
 * One row of 16 pixels is handled per iteration; the loop counter is h and
 * is decremented until it reaches zero, so h must be positive.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between two rows, used for both blocks
 * @param h         number of rows
 * @return sum over all pixels of (pix1 - pix2)^2
 */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
int tmp;

__asm__ volatile (

"movl %4,%%ecx\n"

"pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 
"1:\n"

"movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ 
"movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ 
"movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ 
"movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ 
/* todo: mm1-mm2, mm3-mm4 */

/* algo: subtract mm1 from mm2 with saturation and vice versa */

/* OR the results to get absolute difference */

"movq %%mm1,%%mm5\n"

"movq %%mm3,%%mm6\n"

"psubusb %%mm2,%%mm1\n"

"psubusb %%mm4,%%mm3\n"

"psubusb %%mm5,%%mm2\n"

"psubusb %%mm6,%%mm4\n"

"por %%mm1,%%mm2\n"

"por %%mm3,%%mm4\n"

/* now convert to 16-bit vectors so we can square them */

"movq %%mm2,%%mm1\n"

"movq %%mm4,%%mm3\n"

"punpckhbw %%mm0,%%mm2\n"

"punpckhbw %%mm0,%%mm4\n"

"punpcklbw %%mm0,%%mm1\n" /* differences of pixels 0-7 now spread over (mm1,mm2) */ 
"punpcklbw %%mm0,%%mm3\n" /* differences of pixels 8-15 now spread over (mm3,mm4) */ 
"pmaddwd %%mm2,%%mm2\n"

"pmaddwd %%mm4,%%mm4\n"

"pmaddwd %%mm1,%%mm1\n"

"pmaddwd %%mm3,%%mm3\n"

/* advance both blocks by one row */
"add %3,%0\n"

"add %3,%1\n"

"paddd %%mm2,%%mm1\n"

"paddd %%mm4,%%mm3\n"

"paddd %%mm1,%%mm7\n"

"paddd %%mm3,%%mm7\n"

"decl %%ecx\n"

"jnz 1b\n"

"movq %%mm7,%%mm1\n"

"psrlq $32, %%mm7\n" /* shift hi dword to lo */ 
"paddd %%mm7,%%mm1\n"

"movd %%mm1,%2\n"

: "+r" (pix1), "+r" (pix2), "=r"(tmp) 
: "r" ((x86_reg)line_size) , "m" (h) 
: "%ecx");

return tmp;

} 
/**
 * Sum of squared differences between two 16-pixel-wide blocks (SSE2).
 *
 * Two rows are handled per iteration using unaligned 16-byte loads. h is
 * halved in place and used as the loop counter, so it is expected to be a
 * positive even number.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between two rows, used for both blocks
 * @param h         number of rows
 * @return sum over all pixels of (pix1 - pix2)^2
 */
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
int tmp;

__asm__ volatile (

/* operand %2 is h: two rows per iteration */
"shr $1,%2\n"

"pxor %%xmm0,%%xmm0\n" /* xmm0 = 0 */ 
"pxor %%xmm7,%%xmm7\n" /* xmm7 holds the sum */ 
"1:\n"

"movdqu (%0),%%xmm1\n" /* xmm1 = pix1[0][0-15] */ 
"movdqu (%1),%%xmm2\n" /* xmm2 = pix2[0][0-15] */ 
"movdqu (%0,%4),%%xmm3\n" /* xmm3 = pix1[1][0-15] */ 
"movdqu (%1,%4),%%xmm4\n" /* xmm4 = pix2[1][0-15] */ 
/* todo: xmm1-xmm2, xmm3-xmm4 */

/* algo: subtract xmm1 from xmm2 with saturation and vice versa */

/* OR the results to get absolute difference */

"movdqa %%xmm1,%%xmm5\n"

"movdqa %%xmm3,%%xmm6\n"

"psubusb %%xmm2,%%xmm1\n"

"psubusb %%xmm4,%%xmm3\n"

"psubusb %%xmm5,%%xmm2\n"

"psubusb %%xmm6,%%xmm4\n"

"por %%xmm1,%%xmm2\n"

"por %%xmm3,%%xmm4\n"

/* now convert to 16-bit vectors so we can square them */

"movdqa %%xmm2,%%xmm1\n"

"movdqa %%xmm4,%%xmm3\n"

"punpckhbw %%xmm0,%%xmm2\n"

"punpckhbw %%xmm0,%%xmm4\n"

"punpcklbw %%xmm0,%%xmm1\n" /* row 0 differences now spread over (xmm1,xmm2) */ 
"punpcklbw %%xmm0,%%xmm3\n" /* row 1 differences now spread over (xmm3,xmm4) */ 
"pmaddwd %%xmm2,%%xmm2\n"

"pmaddwd %%xmm4,%%xmm4\n"

"pmaddwd %%xmm1,%%xmm1\n"

"pmaddwd %%xmm3,%%xmm3\n"

"lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */ 
"lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */ 
"paddd %%xmm2,%%xmm1\n"

"paddd %%xmm4,%%xmm3\n"

"paddd %%xmm1,%%xmm7\n"

"paddd %%xmm3,%%xmm7\n"

"decl %2\n"

"jnz 1b\n"

/* horizontal add of the four dword lanes */
"movdqa %%xmm7,%%xmm1\n"

"psrldq $8, %%xmm7\n" /* shift hi qword to lo */ 
"paddd %%xmm1,%%xmm7\n"

"movdqa %%xmm7,%%xmm1\n"

"psrldq $4, %%xmm7\n" /* shift hi dword to lo */ 
"paddd %%xmm1,%%xmm7\n"

"movd %%xmm7,%3\n"

: "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) 
: "r" ((x86_reg)line_size));

return tmp;

} 
/**
 * High-frequency noise measure of an 8-pixel-wide block (MMX).
 *
 * For every row the horizontal gradient p[i] - p[i+1] is formed for
 * i = 0..6 (the row is combined with a copy of itself shifted by one byte;
 * the eighth lane is 0 - 0). The absolute differences between the gradients
 * of vertically adjacent rows are then accumulated.
 *
 * The first two rows are handled before the loop; every loop iteration
 * consumes two more rows and decrements the counter (initialised to h-2) by
 * two. The loop body is entered unconditionally, so h-2 has to be a positive
 * even number.
 *
 * Partial sums are kept in 16-bit lanes (paddw) and only widened to 32 bits
 * for the final horizontal add.
 *
 * @param pix1      first pixel of the block
 * @param line_size distance in bytes between two rows
 * @param h         number of rows
 * @return accumulated absolute gradient differences
 */
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %3,%%ecx\n"           /* ecx = h-2 rows left after the prologue */
        "pxor %%mm7,%%mm7\n"        /* mm7 = 0 */
        "pxor %%mm6,%%mm6\n"        /* mm6 = running word sums */

        /* row 0: gradient in (mm0,mm2) */
        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"         /* shift pair clears the topmost byte */
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        /* row 1: gradient in (mm4,mm5) */
        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        /* (mm0,mm2) = |gradient(row 0) - gradient(row 1)| */
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        /* even row: gradient in (mm0,mm2), compared with the previous (mm4,mm5) */
        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        /* odd row: gradient in (mm4,mm5), compared with the previous (mm0,mm2) */
        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        /* widen the four word sums to dwords and add them up */
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "g" (h-2)
        : "%ecx");
    return tmp;
}
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { 
int tmp;

uint8_t * pix= pix1; 
__asm__ volatile (

"movl %3,%%ecx\n"

"pxor %%mm7,%%mm7\n"

"pxor %%mm6,%%mm6\n"

"movq (%0),%%mm0\n"

"movq 1(%0),%%mm1\n"

"movq %%mm0, %%mm2\n"

"movq %%mm1, %%mm3\n"

"punpcklbw %%mm7,%%mm0\n"

"punpcklbw %%mm7,%%mm1\n"

"punpckhbw %%mm7,%%mm2\n"

"punpckhbw %%mm7,%%mm3\n"

"psubw %%mm1, %%mm0\n"

"psubw %%mm3, %%mm2\n"

"add %2,%0\n"

"movq (%0),%%mm4\n"

"movq 1(%0),%%mm1\n"

"movq %%mm4, %%mm5\n"

"movq %%mm1, %%mm3\n"

"punpcklbw %%mm7,%%mm4\n"

"punpcklbw %%mm7,%%mm1\n"

"punpckhbw %%mm7,%%mm5\n"

"punpckhbw %%mm7,%%mm3\n"

"psubw %%mm1, %%mm4\n"

"psubw %%mm3, %%mm5\n"

"psubw %%mm4, %%mm0\n"

"psubw %%mm5, %%mm2\n"

"pxor %%mm3, %%mm3\n"

"pxor %%mm1, %%mm1\n"

"pcmpgtw %%mm0, %%mm3\n\t"

"pcmpgtw %%mm2, %%mm1\n\t"

"pxor %%mm3, %%mm0\n"

"pxor %%mm1, %%mm2\n"

"psubw %%mm3, %%mm0\n"

"psubw %%mm1, %%mm2\n"

"paddw %%mm0, %%mm2\n"

"paddw %%mm2, %%mm6\n"

"add %2,%0\n"

"1:\n"

"movq (%0),%%mm0\n"

"movq 1(%0),%%mm1\n"

"movq %%mm0, %%mm2\n"

"movq %%mm1, %%mm3\n"

"punpcklbw %%mm7,%%mm0\n"

"punpcklbw %%mm7,%%mm1\n"

"punpckhbw %%mm7,%%mm2\n"

"punpckhbw %%mm7,%%mm3\n"

"psubw %%mm1, %%mm0\n"

"psubw %%mm3, %%mm2\n"

"psubw %%mm0, %%mm4\n"

"psubw %%mm2, %%mm5\n"

"pxor %%mm3, %%mm3\n"

"pxor %%mm1, %%mm1\n"

"pcmpgtw %%mm4, %%mm3\n\t"

"pcmpgtw %%mm5, %%mm1\n\t"

"pxor %%mm3, %%mm4\n"

"pxor %%mm1, %%mm5\n"

"psubw %%mm3, %%mm4\n"

"psubw %%mm1, %%mm5\n"

"paddw %%mm4, %%mm5\n"

"paddw %%mm5, %%mm6\n"

"add %2,%0\n"

"movq (%0),%%mm4\n"

"movq 1(%0),%%mm1\n"

"movq %%mm4, %%mm5\n"

"movq %%mm1, %%mm3\n"

"punpcklbw %%mm7,%%mm4\n"

"punpcklbw %%mm7,%%mm1\n"

"punpckhbw %%mm7,%%mm5\n"

"punpckhbw %%mm7,%%mm3\n"

"psubw %%mm1, %%mm4\n"

"psubw %%mm3, %%mm5\n"

"psubw %%mm4, %%mm0\n"

"psubw %%mm5, %%mm2\n"

"pxor %%mm3, %%mm3\n"

"pxor %%mm1, %%mm1\n"

"pcmpgtw %%mm0, %%mm3\n\t"

"pcmpgtw %%mm2, %%mm1\n\t"

"pxor %%mm3, %%mm0\n"

"pxor %%mm1, %%mm2\n"

"psubw %%mm3, %%mm0\n"

"psubw %%mm1, %%mm2\n"

"paddw %%mm0, %%mm2\n"

"paddw %%mm2, %%mm6\n"

"add %2,%0\n"

"subl $2, %%ecx\n"

" jnz 1b\n"

"movq %%mm6, %%mm0\n"

"punpcklwd %%mm7,%%mm0\n"

"punpckhwd %%mm7,%%mm6\n"

"paddd %%mm0, %%mm6\n"

615  
616 
"movq %%mm6,%%mm0\n"

617 
"psrlq $32, %%mm6\n"

618 
"paddd %%mm6,%%mm0\n"

619 
"movd %%mm0,%1\n"

620 
: "+r" (pix1), "=r"(tmp) 
621 
: "r" ((x86_reg)line_size) , "g" (h2) 
622 
: "%ecx");

623 
return tmp + hf_noise8_mmx(pix+8, line_size, h); 
624 
} 
637  
638 
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
639 
MpegEncContext *c = p; 
640 
int score1= sse8_mmx(c, pix1, pix2, line_size, h);

641 
int score2= hf_noise8_mmx(pix1, line_size, h)  hf_noise8_mmx(pix2, line_size, h);

642  
643 
if(c) return score1 + FFABS(score2)*c>avctx>nsse_weight; 
644 
else return score1 + FFABS(score2)*8; 
645 
} 
646  
647 
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 
648 
int tmp;

649  
650 
assert( (((int)pix) & 7) == 0); 
651 
assert((line_size &7) ==0); 
652  
653 
#define SUM(in0, in1, out0, out1) \

654 
"movq (%0), %%mm2\n"\

655 
"movq 8(%0), %%mm3\n"\

656 
"add %2,%0\n"\

657 
"movq %%mm2, " #out0 "\n"\ 
658 
"movq %%mm3, " #out1 "\n"\ 
659 
"psubusb " #in0 ", %%mm2\n"\ 
660 
"psubusb " #in1 ", %%mm3\n"\ 
661 
"psubusb " #out0 ", " #in0 "\n"\ 
662 
"psubusb " #out1 ", " #in1 "\n"\ 
663 
"por %%mm2, " #in0 "\n"\ 
664 
"por %%mm3, " #in1 "\n"\ 
665 
"movq " #in0 ", %%mm2\n"\ 
666 
"movq " #in1 ", %%mm3\n"\ 
667 
"punpcklbw %%mm7, " #in0 "\n"\ 
668 
"punpcklbw %%mm7, " #in1 "\n"\ 
669 
"punpckhbw %%mm7, %%mm2\n"\

670 
"punpckhbw %%mm7, %%mm3\n"\

671 
"paddw " #in1 ", " #in0 "\n"\ 
672 
"paddw %%mm3, %%mm2\n"\

673 
"paddw %%mm2, " #in0 "\n"\ 
674 
"paddw " #in0 ", %%mm6\n" 
675  
676  
677 
__asm__ volatile (

678 
"movl %3,%%ecx\n"

679 
"pxor %%mm6,%%mm6\n"

680 
"pxor %%mm7,%%mm7\n"

681 
"movq (%0),%%mm0\n"

682 
"movq 8(%0),%%mm1\n"

683 
"add %2,%0\n"

684 
"jmp 2f\n"

685 
"1:\n"

686  
687 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 
688 
"2:\n"

689 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 
690  
691 
"subl $2, %%ecx\n"

692 
"jnz 1b\n"

693  
694 
"movq %%mm6,%%mm0\n"

695 
"psrlq $32, %%mm6\n"

696 
"paddw %%mm6,%%mm0\n"

697 
"movq %%mm0,%%mm6\n"

698 
"psrlq $16, %%mm0\n"

699 
"paddw %%mm6,%%mm0\n"

700 
"movd %%mm0,%1\n"

701 
: "+r" (pix), "=r"(tmp) 
702 
: "r" ((x86_reg)line_size) , "m" (h) 
703 
: "%ecx");

704 
return tmp & 0xFFFF; 
705 
} 
706 
#undef SUM

707  
708 
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 
709 
int tmp;

710  
711 
assert( (((int)pix) & 7) == 0); 
712 
assert((line_size &7) ==0); 
713  
714 
#define SUM(in0, in1, out0, out1) \

715 
"movq (%0), " #out0 "\n"\ 
716 
"movq 8(%0), " #out1 "\n"\ 
717 
"add %2,%0\n"\

718 
"psadbw " #out0 ", " #in0 "\n"\ 
719 
"psadbw " #out1 ", " #in1 "\n"\ 
720 
"paddw " #in1 ", " #in0 "\n"\ 
721 
"paddw " #in0 ", %%mm6\n" 
722  
723 
__asm__ volatile (

724 
"movl %3,%%ecx\n"

725 
"pxor %%mm6,%%mm6\n"

726 
"pxor %%mm7,%%mm7\n"

727 
"movq (%0),%%mm0\n"

728 
"movq 8(%0),%%mm1\n"

729 
"add %2,%0\n"

730 
"jmp 2f\n"

731 
"1:\n"

732  
733 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 
734 
"2:\n"

735 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 
736  
737 
"subl $2, %%ecx\n"

738 
"jnz 1b\n"

739  
740 
"movd %%mm6,%1\n"

741 
: "+r" (pix), "=r"(tmp) 
742 
: "r" ((x86_reg)line_size) , "m" (h) 
743 
: "%ecx");

744 
return tmp;

745 
} 
746 
#undef SUM

747  
748 
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
749 
int tmp;

750  
751 
assert( (((int)pix1) & 7) == 0); 
752 
assert( (((int)pix2) & 7) == 0); 
753 
assert((line_size &7) ==0); 
754  
755 
#define SUM(in0, in1, out0, out1) \

756 
"movq (%0),%%mm2\n"\

757 
"movq (%1)," #out0 "\n"\ 
758 
"movq 8(%0),%%mm3\n"\

759 
"movq 8(%1)," #out1 "\n"\ 
760 
"add %3,%0\n"\

761 
"add %3,%1\n"\

762 
"psubb " #out0 ", %%mm2\n"\ 
763 
"psubb " #out1 ", %%mm3\n"\ 
764 
"pxor %%mm7, %%mm2\n"\

765 
"pxor %%mm7, %%mm3\n"\

766 
"movq %%mm2, " #out0 "\n"\ 
767 
"movq %%mm3, " #out1 "\n"\ 
768 
"psubusb " #in0 ", %%mm2\n"\ 
769 
"psubusb " #in1 ", %%mm3\n"\ 
770 
"psubusb " #out0 ", " #in0 "\n"\ 
771 
"psubusb " #out1 ", " #in1 "\n"\ 
772 
"por %%mm2, " #in0 "\n"\ 
773 
"por %%mm3, " #in1 "\n"\ 
774 
"movq " #in0 ", %%mm2\n"\ 
775 
"movq " #in1 ", %%mm3\n"\ 
776 
"punpcklbw %%mm7, " #in0 "\n"\ 
777 
"punpcklbw %%mm7, " #in1 "\n"\ 
778 
"punpckhbw %%mm7, %%mm2\n"\

779 
"punpckhbw %%mm7, %%mm3\n"\

780 
"paddw " #in1 ", " #in0 "\n"\ 
781 
"paddw %%mm3, %%mm2\n"\

782 
"paddw %%mm2, " #in0 "\n"\ 
783 
"paddw " #in0 ", %%mm6\n" 
784  
785  
786 
__asm__ volatile (

787 
"movl %4,%%ecx\n"

788 
"pxor %%mm6,%%mm6\n"

789 
"pcmpeqw %%mm7,%%mm7\n"

790 
"psllw $15, %%mm7\n"

791 
"packsswb %%mm7, %%mm7\n"

792 
"movq (%0),%%mm0\n"

793 
"movq (%1),%%mm2\n"

794 
"movq 8(%0),%%mm1\n"

795 
"movq 8(%1),%%mm3\n"

796 
"add %3,%0\n"

797 
"add %3,%1\n"

798 
"psubb %%mm2, %%mm0\n"

799 
"psubb %%mm3, %%mm1\n"

800 
"pxor %%mm7, %%mm0\n"

801 
"pxor %%mm7, %%mm1\n"

802 
"jmp 2f\n"

803 
"1:\n"

804  
805 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 
806 
"2:\n"

807 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 
808  
809 
"subl $2, %%ecx\n"

810 
"jnz 1b\n"

811  
812 
"movq %%mm6,%%mm0\n"

813 
"psrlq $32, %%mm6\n"

814 
"paddw %%mm6,%%mm0\n"

815 
"movq %%mm0,%%mm6\n"

816 
"psrlq $16, %%mm0\n"

817 
"paddw %%mm6,%%mm0\n"

818 
"movd %%mm0,%2\n"

819 
: "+r" (pix1), "+r" (pix2), "=r"(tmp) 
820 
: "r" ((x86_reg)line_size) , "m" (h) 
821 
: "%ecx");

822 
return tmp & 0x7FFF; 
823 
} 
824 
#undef SUM

825  
826 
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 
827 
int tmp;

828  
829 
assert( (((int)pix1) & 7) == 0); 
830 
assert( (((int)pix2) & 7) == 0); 
831 
assert((line_size &7) ==0); 
832  
833 
#define SUM(in0, in1, out0, out1) \

834 
"movq (%0)," #out0 "\n"\ 
835 
"movq (%1),%%mm2\n"\

836 
"movq 8(%0)," #out1 "\n"\ 
837 
"movq 8(%1),%%mm3\n"\

838 
"add %3,%0\n"\

839 
"add %3,%1\n"\

840 
"psubb %%mm2, " #out0 "\n"\ 
841 
"psubb %%mm3, " #out1 "\n"\ 
842 
"pxor %%mm7, " #out0 "\n"\ 
843 
"pxor %%mm7, " #out1 "\n"\ 
844 
"psadbw " #out0 ", " #in0 "\n"\ 
845 
"psadbw " #out1 ", " #in1 "\n"\ 
846 
"paddw " #in1 ", " #in0 "\n"\ 
847 
"paddw " #in0 ", %%mm6\n" 
848  
849 
__asm__ volatile (

850 
"movl %4,%%ecx\n"

851 
"pxor %%mm6,%%mm6\n"

852 
"pcmpeqw %%mm7,%%mm7\n"

853 
"psllw $15, %%mm7\n"

854 
"packsswb %%mm7, %%mm7\n"

855 
"movq (%0),%%mm0\n"

856 
"movq (%1),%%mm2\n"

857 
"movq 8(%0),%%mm1\n"

858 
"movq 8(%1),%%mm3\n"

859 
"add %3,%0\n"

860 
"add %3,%1\n"

861 
"psubb %%mm2, %%mm0\n"

862 
"psubb %%mm3, %%mm1\n"

863 
"pxor %%mm7, %%mm0\n"

864 
"pxor %%mm7, %%mm1\n"

865 
"jmp 2f\n"

866 
"1:\n"

867  
868 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 
869 
"2:\n"

870 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 
871  
872 
"subl $2, %%ecx\n"

873 
"jnz 1b\n"

874  
875 
"movd %%mm6,%2\n"

876 
: "+r" (pix1), "+r" (pix2), "=r"(tmp) 
877 
: "r" ((x86_reg)line_size) , "m" (h) 
878 
: "%ecx");

879 
return tmp;

880 
} 
881 
#undef SUM

882  
883 
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 
884 
x86_reg i=0;

885 
__asm__ volatile(

886 
"1: \n\t"

887 
"movq (%2, %0), %%mm0 \n\t"

888 
"movq (%1, %0), %%mm1 \n\t"

889 
"psubb %%mm0, %%mm1 \n\t"

890 
"movq %%mm1, (%3, %0) \n\t"

891 
"movq 8(%2, %0), %%mm0 \n\t"

892 
"movq 8(%1, %0), %%mm1 \n\t"

893 
"psubb %%mm0, %%mm1 \n\t"

894 
"movq %%mm1, 8(%3, %0) \n\t"

895 
"add $16, %0 \n\t"

896 
"cmp %4, %0 \n\t"

897 
" jb 1b \n\t"

898 
: "+r" (i)

899 
: "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w15) 
900 
); 
901 
for(; i<w; i++)

902 
dst[i+0] = src1[i+0]src2[i+0]; 
903 
} 
904  
905 
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){ 
906 
x86_reg i=0;

907 
uint8_t l, lt; 
908  
909 
__asm__ volatile(

910 
"1: \n\t"

911 
"movq 1(%1, %0), %%mm0 \n\t" // LT 
912 
"movq (%1, %0), %%mm1 \n\t" // T 
913 
"movq 1(%2, %0), %%mm2 \n\t" // L 
914 
"movq (%2, %0), %%mm3 \n\t" // X 
915 
"movq %%mm2, %%mm4 \n\t" // L 
916 
"psubb %%mm0, %%mm2 \n\t"

917 
"paddb %%mm1, %%mm2 \n\t" // L + T  LT 
918 
"movq %%mm4, %%mm5 \n\t" // L 
919 
"pmaxub %%mm1, %%mm4 \n\t" // max(T, L) 
920 
"pminub %%mm5, %%mm1 \n\t" // min(T, L) 
921 
"pminub %%mm2, %%mm4 \n\t"

922 
"pmaxub %%mm1, %%mm4 \n\t"

923 
"psubb %%mm4, %%mm3 \n\t" // dst  pred 
924 
"movq %%mm3, (%3, %0) \n\t"

925 
"add $8, %0 \n\t"

926 
"cmp %4, %0 \n\t"

927 
" jb 1b \n\t"

928 
: "+r" (i)

929 
: "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w) 
930 
); 
931  
932 
l= *left; 
933 
lt= *left_top; 
934  
935 
dst[0]= src2[0]  mid_pred(l, src1[0], (l + src1[0]  lt)&0xFF); 
936  
937 
*left_top= src1[w1];

938 
*left = src2[w1];

939 
} 

/* Load one row from p1 into a and one from p2 into t (mov<m>), widen both to
 * words by interleaving with the bytes of a, and leave a = a - t.
 * The interleaved high bytes are identical in both operands and cancel in
 * the subtraction, so no zero register is needed. */
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"              \n\t"\
    "mov"#m" "#p2", "#t"              \n\t"\
    "punpcklbw "#a", "#t"             \n\t"\
    "punpcklbw "#a", "#a"             \n\t"\
    "psubw     "#t", "#a"             \n\t"

/* Compute eight rows of word differences p1 - p2 into registers mm##0 ..
 * mm##7. Register 7 serves as scratch for rows 0..6; for the last row
 * register 0 is parked in temp, used as scratch and then restored.
 * %3 = stride, %4 = 3*stride. */
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    __asm__ volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                   \n\t"\
        "add %4, %2                   \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0          \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0          \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
    );\
}
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

/* 4 pixels per row in MMX registers / 8 pixels per row in XMM registers */
#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)


/* Two butterflies in parallel: (a, b) -> (a + b, b - a) for both register
 * pairs, computed as a += b; b += b; b -= a. */
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

/* Three butterfly stages over eight registers (distances 1, 2 and 4). */
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)


/* a = |a| per word; z is scratch: sign mask via pcmpgtw, then (a ^ z) - z. */
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

/* a = |a| per word; z is scratch: max(a, 0 - a). */
#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

/* a = |a| per word with a single instruction; z is unused. */
#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

/* sum += |a| with unsigned saturation; MMABS is selected by the includer. */
#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

/* xmm0 = saturated sum of |xmm0| .. |xmm7|, using xmm8/xmm9 as scratch. */
#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#if ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
/* Same result using only xmm0..xmm7: xmm7 is spilled to (%1) so it can
 * serve as scratch and is summed last after being reloaded into xmm2. */
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif


/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */

/* Horizontal saturating add of the words of a; t is scratch. The sum ends up
 * in the low word moved to dst, the bits above it are not meaningful. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

/* Same reduction using pshufw instead of copy + shift. */
#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

/* Eight-word variant for XMM registers: fold the high half first. */
#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"

/* Defines hadamard8_diff_<cpu>(): sum of absolute values of the 8x8 Hadamard
 * transform of src1 - src2, built from MMX registers.
 *
 * The block is processed as two 4-pixel-wide halves: each half is
 * differenced (DIFF_PIXELS_4x8), transformed (HADAMARD48), transposed and
 * parked in temp. The second transform pass then runs on the rearranged
 * data, absolute values are accumulated with saturation and HSUM reduces
 * them to one value, which is masked to 16 bits.
 *
 * Requires MMABS and HSUM to be defined by the includer; TRANSPOSE4, STORE4,
 * LOAD4 and WRAPPER8_16_SQ come from elsewhere in the project. h is asserted
 * to be 8. WRAPPER8_16_SQ presumably builds hadamard8_diff16_<cpu> from the
 * 8x8 version - verify against its definition. */
#define HADAMARD8_DIFF_MMX(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(8, uint64_t, temp)[13];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8,  0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
\
    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
\
    __asm__ volatile(\
        HADAMARD48\
\
        "movq %%mm7, 96(%1)             \n\t"\
\
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
\
        "movq 96(%1), %%mm7             \n\t"\
        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
        "movq %%mm7, %%mm5              \n\t"/*FIXME remove*/\
        "movq %%mm6, %%mm7              \n\t"\
        "movq %%mm0, %%mm6              \n\t"\
\
        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
\
        HADAMARD48\
        "movq %%mm7, 64(%1)             \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq 64(%1), %%mm2             \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw %%mm1, %%mm0           \n\t"\
        "movq %%mm0, 64(%1)             \n\t"\
\
        LOAD4(8,  0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
\
        HADAMARD48\
        "movq %%mm7, (%1)               \n\t"\
        MMABS(%%mm0, %%mm7)\
        MMABS(%%mm1, %%mm7)\
        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
        "movq (%1), %%mm2               \n\t"\
        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
        "paddusw 64(%1), %%mm0          \n\t"\
        "paddusw %%mm1, %%mm0           \n\t"\
\
        HSUM(%%mm0, %%mm1, %0)\
\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)

/* Defines hadamard8_diff_<cpu>(): sum of absolute values of the 8x8 Hadamard
 * transform of src1 - src2, with the whole block held in xmm0..xmm7.
 *
 * Sequence: DIFF_PIXELS_8x8, HADAMARD8, TRANSPOSE8 (using temp as scratch),
 * HADAMARD8 on the permuted register order left by the transpose,
 * MMABS_SUM_8x8 and HSUM_SSE2; the result is masked to 16 bits.
 *
 * Requires MMABS and MMABS_SUM_8x8 to be defined by the includer; TRANSPOSE8
 * and WRAPPER8_16_SQ come from elsewhere in the project. h is asserted to be
 * 8. */
#define HADAMARD8_DIFF_SSE2(cpu) \
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
    DECLARE_ALIGNED(16, uint64_t, temp)[4];\
    int sum;\
\
    assert(h==8);\
\
    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
\
    __asm__ volatile(\
        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
        MMABS_SUM_8x8\
        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
        : "=r" (sum)\
        : "r"(temp)\
    );\
    return sum&0xFFFF;\
}\
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
1168  
1169 
#define MMABS(a,z) MMABS_MMX(a,z)

1170 
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)

1171 
HADAMARD8_DIFF_MMX(mmx) 
1172 
#undef MMABS

1173 
#undef HSUM

1174  
1175 
#define MMABS(a,z) MMABS_MMX2(a,z)

1176 
#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2

1177 
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)

1178 
HADAMARD8_DIFF_MMX(mmx2) 
1179 
HADAMARD8_DIFF_SSE2(sse2) 
1180 
#undef MMABS

1181 
#undef MMABS_SUM_8x8

1182 
#undef HSUM

1183  
1184 
#if HAVE_SSSE3

1185 
#define MMABS(a,z) MMABS_SSSE3(a,z)

1186 
#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL

1187 
HADAMARD8_DIFF_SSE2(ssse3) 
1188 
#undef MMABS

1189 
#undef MMABS_SUM_8x8

1190 
#endif

1191  
1192 
/*
 * Asm fragment: load four vectors from the buffer addressed by asm operand
 * %1, at byte offsets o+0, o+16, o+32 and o+48, into registers <mm>2..<mm>5,
 * then feed each to MMABS_SUM.
 *
 *   m  - suffix appended to "mov" to select the load instruction
 *        (e.g. q -> movq, dqa -> movdqa)
 *   mm - register name prefix (e.g. %%mm or %%xmm)
 *   o  - base byte offset added to each load address
 *
 * Registers <mm>0 and <mm>1 are passed as the third MMABS_SUM argument and
 * <mm>6 / <mm>7 as the second; presumably these are the accumulators and
 * scratch registers respectively (MMABS_SUM is defined elsewhere - confirm).
 */
#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)
1201  
1202 
/*
 * Asm body for the 64-bit (MMX register) sum_abs_dctelem variants.
 *
 * Clears mm0 and mm1, runs DCT_SAD4 with 8-byte loads at base offsets
 * 0, 8, 64 and 72 (16 loads in total, covering bytes 0..127 of the buffer
 * addressed by %1), merges the two partial results with a saturating word
 * add, and lets HSUM reduce mm0 into asm operand %0.
 *
 * HSUM and MMABS must be defined by the instantiating code.
 */
#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0 \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0 \n\t"\
    HSUM(%%mm0, %%mm1, %0)

1211  
1212 
/*
 * Asm body for the 128-bit (XMM register) sum_abs_dctelem variants.
 *
 * Clears xmm0 and xmm1, runs DCT_SAD4 with aligned 16-byte loads (movdqa)
 * at base offsets 0 and 64 (8 loads in total, covering bytes 0..127 of the
 * buffer addressed by %1), merges the two partial results with a
 * saturating word add, and lets HSUM reduce xmm0 into asm operand %0.
 *
 * Because movdqa is used, the buffer must be 16-byte aligned.
 * HSUM and MMABS must be defined by the instantiating code.
 */
#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0 \n\t"\
    "pxor %%xmm1, %%xmm1 \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0 \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

1219  
1220 
/*
 * Defines sum_abs_dctelem_<cpu>(DCTELEM *block).
 *
 * The function body is the asm fragment currently bound to DCT_SAD, with
 * sum as output operand %0 and block as input operand %1. Only the low
 * 16 bits of the result are returned.
 *
 * DCT_SAD (and the HSUM / MMABS macros it expands to) must be defined
 * before this macro is instantiated.
 */
#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}
1230  
1231 
#define DCT_SAD DCT_SAD_MMX

1232 
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)

1233 
#define MMABS(a,z) MMABS_MMX(a,z)

1234 
DCT_SAD_FUNC(mmx) 
1235 
#undef MMABS

1236 
#undef HSUM

1237  
1238 
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)

1239 
#define MMABS(a,z) MMABS_MMX2(a,z)

1240 
DCT_SAD_FUNC(mmx2) 
1241 
#undef HSUM

1242 
#undef DCT_SAD

1243  
1244 
#define DCT_SAD DCT_SAD_SSE2

1245 
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)

1246 
DCT_SAD_FUNC(sse2) 
1247 
#undef MMABS

1248  
1249 
#if HAVE_SSSE3

1250 
#define MMABS(a,z) MMABS_SSSE3(a,z)

1251 
DCT_SAD_FUNC(ssse3) 
1252 
#undef MMABS

1253 
#endif

1254 
#undef HSUM

1255 
#undef DCT_SAD

1256  
1257 
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){ 
1258 
int sum;

1259 
x86_reg i=size; 
1260 
__asm__ volatile(

1261 
"pxor %%mm4, %%mm4 \n"

1262 
"1: \n"

1263 
"sub $8, %0 \n"

1264 
"movq (%2,%0), %%mm2 \n"

1265 
"movq (%3,%0,2), %%mm0 \n"

1266 
"movq 8(%3,%0,2), %%mm1 \n"

1267 
"punpckhbw %%mm2, %%mm3 \n"

1268 
"punpcklbw %%mm2, %%mm2 \n"

1269 
"psraw $8, %%mm3 \n"

1270 
"psraw $8, %%mm2 \n"

1271 
"psubw %%mm3, %%mm1 \n"

1272 
"psubw %%mm2, %%mm0 \n"

1273 
"pmaddwd %%mm1, %%mm1 \n"

1274 
"pmaddwd %%mm0, %%mm0 \n"

1275 
"paddd %%mm1, %%mm4 \n"

1276 
"paddd %%mm0, %%mm4 \n"

1277 
"jg 1b \n"

1278 
"movq %%mm4, %%mm3 \n"

1279 
"psrlq $32, %%mm3 \n"

1280 
"paddd %%mm3, %%mm4 \n"

1281 
"movd %%mm4, %1 \n"

1282 
:"+r"(i), "=r"(sum) 
1283 
:"r"(pix1), "r"(pix2) 
1284 
); 
1285 
return sum;

1286 
} 
1287  
1288 
/*
 * Asm fragment: horizontal add of the two dwords of 64-bit register a.
 * Copies a to t, shifts a right by 32 bits and adds t back in, so the low
 * dword of a ends up holding low+high. t is overwritten.
 */
#define PHADDD(a, t)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddd "#t", "#a" \n\t"
1292 
/*
   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/

1297 
/*
 * MMX instantiation of the quantization-noise-shaping template.
 *
 * PMULHRW(x, y, s, o) emulates a rounded high multiply with plain MMX
 * instructions: x and y are each multiplied by s keeping the high word
 * (pmulhw), the offset o is added, and the result is shifted right by one.
 *
 * The template is compiled with function names suffixed _mmx (DEF),
 * SET_RND bound to MOVQ_WONE and SCALE_OFFSET set to 1; all four macros
 * are undefined again afterwards so the next instantiation can rebind them.
 */
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x " \n\t"\
    "pmulhw " #s ", "#y " \n\t"\
    "paddw " #o ", "#x " \n\t"\
    "paddw " #o ", "#y " \n\t"\
    "psraw $1, "#x " \n\t"\
    "psraw $1, "#y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

1314  
1315 
/*
 * 3DNow! instantiation of the quantization-noise-shaping template.
 *
 * Function names get the _3dnow suffix. PMULHRW maps directly onto the
 * native pmulhrw instruction, so the o (offset) argument is unused,
 * SET_RND expands to nothing and SCALE_OFFSET is 0.
 */
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x " \n\t"\
    "pmulhrw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

1328  
1329 
/*
 * SSSE3 instantiation of the quantization-noise-shaping template, built
 * only when HAVE_SSSE3 is set.
 *
 * Function names get the _ssse3 suffix. PHADDD is rebound to a
 * pshufw + paddd sequence and PMULHRW to the native pmulhrsw instruction
 * (the o argument is unused); SET_RND expands to nothing and SCALE_OFFSET
 * is 1. PHADDD is left undefined after this section.
 */
#if HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET 1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x " \n\t"\
    "pmulhrsw " #s ", "#y " \n\t"

#include "dsputil_mmx_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3
1349  
1350  
1351 
/* SSE2 LPC autocorrelation routine; defined outside this file and
 * installed by dsputilenc_init_mmx() when CONFIG_LPC is enabled. */
void ff_lpc_compute_autocorr_sse2(const int32_t *data, int len, int lag,
                                  double *autoc);

1353  
1354  
1355 
/**
 * Install the x86 SIMD encoder routines into a DSPContext.
 *
 * Function pointers are assigned in order of increasing capability
 * (MMX, then MMX2, SSE2, SSSE3, and finally 3DNow!), so a later section
 * overrides the pointers set by an earlier one when the CPU reports both
 * feature flags. Nothing is installed here unless FF_MM_MMX is set.
 *
 * Routines guarded by CODEC_FLAG_BITEXACT (vsad[0], try_8x8basis) are
 * only installed when the caller did not request bit-exact output.
 *
 * dsputil_init_pix_mmx() is always called at the end, regardless of the
 * detected CPU flags.
 *
 * @param c     DSP context whose function pointers are filled in
 * @param avctx codec context; dct_algo and flags are read from it
 */
void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    if (mm_flags & FF_MM_MMX) {
        const int dct_algo = avctx->dct_algo;
        /* Only replace the forward DCT if the user left the choice open
         * or explicitly asked for the MMX family. */
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            if(mm_flags & FF_MM_SSE2){
                c->fdct = ff_fdct_sse2;
            }else if(mm_flags & FF_MM_MMX2){
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;

        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
        c->hadamard8_diff[1]= hadamard8_diff_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = (mm_flags & FF_MM_SSE2) ? sse16_sse2 : sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;


        if (mm_flags & FF_MM_MMX2) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & FF_MM_SSE2){
            c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
            c->hadamard8_diff[1]= hadamard8_diff_sse2;
#if CONFIG_LPC
            c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
#endif
        }

#if HAVE_SSSE3
        if(mm_flags & FF_MM_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
        }
#endif

        /* Checked last, so the 3DNow! basis routines take precedence over
         * the MMX/SSSE3 ones when this flag is set. */
        if(mm_flags & FF_MM_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }

    dsputil_init_pix_mmx(c, avctx);
}