Revision 97d1d009 libavcodec/i386/dsputil_mmx.c
libavcodec/i386/dsputil_mmx.c  

74  74 
#define JUMPALIGN() asm volatile (ASMALIGN(3)::) 
75  75 
#define MOVQ_ZERO(regd) asm volatile ("pxor %%" #regd ", %%" #regd ::) 
76  76  
77 
/* Fill the given MMX register with 0x0001 in each 16-bit lane:
 * pcmpeqd sets all bits, psrlw $15 leaves only the low bit of each word. */
#define MOVQ_WONE(regd) \
    asm volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd ::)

81  
82  77 
#define MOVQ_BFE(regd) \ 
83  78 
asm volatile ( \ 
84  79 
"pcmpeqd %%" #regd ", %%" #regd " \n\t"\ 
...  ...  
220  215 
/***********************************/ 
221  216 
/* standard MMX */ 
222  217  
223 
#ifdef CONFIG_ENCODERS 

224 
/* Unpack an 8x8 block of unsigned bytes into 64 16-bit DCT coefficients.
 * Two rows are processed per iteration; %%REG_a counts from -128 up to 0
 * so the store pointer (block+64, i.e. +128 bytes) lands exactly on block[],
 * and "js 1b" terminates when the counter reaches zero. */
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    asm volatile(
        "mov $-128, %%"REG_a"           \n\t"
        "pxor %%mm7, %%mm7              \n\t" /* zero, for byte->word unpack */
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%0, %2), %%mm2           \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "movq %%mm0, (%1, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%1, %%"REG_a")   \n\t"
        "movq %%mm2, 16(%1, %%"REG_a")  \n\t"
        "movq %%mm3, 24(%1, %%"REG_a")  \n\t"
        "add %3, %0                     \n\t" /* pixels += 2*line_size */
        "add $32, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
        : "%"REG_a
    );
}

251  
252 
/* block[i] = s1[i] - s2[i] for an 8x8 region, widened to 16-bit words.
 * One row (8 pixels -> 16 bytes of DCTELEM) per iteration; %%REG_a runs
 * from -128 to 0 against block+64 as in get_pixels_mmx. */
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t" /* zero, for byte->word unpack */
        "mov $-128, %%"REG_a"           \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq (%0), %%mm0               \n\t"
        "movq (%1), %%mm2               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "movq %%mm0, (%2, %%"REG_a")    \n\t"
        "movq %%mm1, 8(%2, %%"REG_a")   \n\t"
        "add %3, %0                     \n\t"
        "add %3, %1                     \n\t"
        "add $16, %%"REG_a"             \n\t"
        "jnz 1b                         \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((long)stride)
        : "%"REG_a
    );
}

280 
#endif //CONFIG_ENCODERS 

281  
282  218 
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) 
283  219 
{ 
284  220 
const DCTELEM *p; 
...  ...  
544  480 
); 
545  481 
} 
546  482  
547 
#ifdef CONFIG_ENCODERS 

548 
static int pix_sum16_mmx(uint8_t * pix, int line_size){ 

549 
const int h=16; 

550 
int sum; 

551 
long index= line_size*h; 

552  
553 
asm volatile( 

554 
"pxor %%mm7, %%mm7 \n\t" 

555 
"pxor %%mm6, %%mm6 \n\t" 

556 
"1: \n\t" 

557 
"movq (%2, %1), %%mm0 \n\t" 

558 
"movq (%2, %1), %%mm1 \n\t" 

559 
"movq 8(%2, %1), %%mm2 \n\t" 

560 
"movq 8(%2, %1), %%mm3 \n\t" 

561 
"punpcklbw %%mm7, %%mm0 \n\t" 

562 
"punpckhbw %%mm7, %%mm1 \n\t" 

563 
"punpcklbw %%mm7, %%mm2 \n\t" 

564 
"punpckhbw %%mm7, %%mm3 \n\t" 

565 
"paddw %%mm0, %%mm1 \n\t" 

566 
"paddw %%mm2, %%mm3 \n\t" 

567 
"paddw %%mm1, %%mm3 \n\t" 

568 
"paddw %%mm3, %%mm6 \n\t" 

569 
"add %3, %1 \n\t" 

570 
" js 1b \n\t" 

571 
"movq %%mm6, %%mm5 \n\t" 

572 
"psrlq $32, %%mm6 \n\t" 

573 
"paddw %%mm5, %%mm6 \n\t" 

574 
"movq %%mm6, %%mm5 \n\t" 

575 
"psrlq $16, %%mm6 \n\t" 

576 
"paddw %%mm5, %%mm6 \n\t" 

577 
"movd %%mm6, %0 \n\t" 

578 
"andl $0xFFFF, %0 \n\t" 

579 
: "=&r" (sum), "+r" (index) 

580 
: "r" (pix  index), "r" ((long)line_size) 

581 
); 

582  
583 
return sum; 

584 
} 

585 
#endif //CONFIG_ENCODERS 

586  
587  483 
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ 
588  484 
long i=0; 
589  485 
asm volatile( 
...  ...  
800  696 
} 
801  697 
} 
802  698  
803 
#ifdef CONFIG_ENCODERS 

804 
static int pix_norm1_mmx(uint8_t *pix, int line_size) { 

805 
int tmp; 

806 
asm volatile ( 

807 
"movl $16,%%ecx\n" 

808 
"pxor %%mm0,%%mm0\n" 

809 
"pxor %%mm7,%%mm7\n" 

810 
"1:\n" 

811 
"movq (%0),%%mm2\n" /* mm2 = pix[07] */ 

812 
"movq 8(%0),%%mm3\n" /* mm3 = pix[815] */ 

813  
814 
"movq %%mm2,%%mm1\n" /* mm1 = mm2 = pix[07] */ 

815  
816 
"punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix47] */ 

817 
"punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix03] */ 

818  
819 
"movq %%mm3,%%mm4\n" /* mm4 = mm3 = pix[815] */ 

820 
"punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix1215] */ 

821 
"punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix811] */ 

822  
823 
"pmaddwd %%mm1,%%mm1\n" /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */ 

824 
"pmaddwd %%mm2,%%mm2\n" /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */ 

825  
826 
"pmaddwd %%mm3,%%mm3\n" 

827 
"pmaddwd %%mm4,%%mm4\n" 

828  
829 
"paddd %%mm1,%%mm2\n" /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2, 

830 
pix2^2+pix3^2+pix6^2+pix7^2) */ 

831 
"paddd %%mm3,%%mm4\n" 

832 
"paddd %%mm2,%%mm7\n" 

833  
834 
"add %2, %0\n" 

835 
"paddd %%mm4,%%mm7\n" 

836 
"dec %%ecx\n" 

837 
"jnz 1b\n" 

838  
839 
"movq %%mm7,%%mm1\n" 

840 
"psrlq $32, %%mm7\n" /* shift hi dword to lo */ 

841 
"paddd %%mm7,%%mm1\n" 

842 
"movd %%mm1,%1\n" 

843 
: "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" ); 

844 
return tmp; 

845 
} 

846  
847 
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 

848 
int tmp; 

849 
asm volatile ( 

850 
"movl %4,%%ecx\n" 

851 
"shr $1,%%ecx\n" 

852 
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 

853 
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 

854 
"1:\n" 

855 
"movq (%0),%%mm1\n" /* mm1 = pix1[0][07] */ 

856 
"movq (%1),%%mm2\n" /* mm2 = pix2[0][07] */ 

857 
"movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][07] */ 

858 
"movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][07] */ 

859  
860 
/* todo: mm1mm2, mm3mm4 */ 

861 
/* algo: subtract mm1 from mm2 with saturation and vice versa */ 

862 
/* OR the results to get absolute difference */ 

863 
"movq %%mm1,%%mm5\n" 

864 
"movq %%mm3,%%mm6\n" 

865 
"psubusb %%mm2,%%mm1\n" 

866 
"psubusb %%mm4,%%mm3\n" 

867 
"psubusb %%mm5,%%mm2\n" 

868 
"psubusb %%mm6,%%mm4\n" 

869  
870 
"por %%mm1,%%mm2\n" 

871 
"por %%mm3,%%mm4\n" 

872  
873 
/* now convert to 16bit vectors so we can square them */ 

874 
"movq %%mm2,%%mm1\n" 

875 
"movq %%mm4,%%mm3\n" 

876  
877 
"punpckhbw %%mm0,%%mm2\n" 

878 
"punpckhbw %%mm0,%%mm4\n" 

879 
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 

880 
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 

881  
882 
"pmaddwd %%mm2,%%mm2\n" 

883 
"pmaddwd %%mm4,%%mm4\n" 

884 
"pmaddwd %%mm1,%%mm1\n" 

885 
"pmaddwd %%mm3,%%mm3\n" 

886  
887 
"lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ 

888 
"lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ 

889  
890 
"paddd %%mm2,%%mm1\n" 

891 
"paddd %%mm4,%%mm3\n" 

892 
"paddd %%mm1,%%mm7\n" 

893 
"paddd %%mm3,%%mm7\n" 

894  
895 
"decl %%ecx\n" 

896 
"jnz 1b\n" 

897  
898 
"movq %%mm7,%%mm1\n" 

899 
"psrlq $32, %%mm7\n" /* shift hi dword to lo */ 

900 
"paddd %%mm7,%%mm1\n" 

901 
"movd %%mm1,%2\n" 

902 
: "+r" (pix1), "+r" (pix2), "=r"(tmp) 

903 
: "r" ((long)line_size) , "m" (h) 

904 
: "%ecx"); 

905 
return tmp; 

906 
} 

907  
908 
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 

909 
int tmp; 

910 
asm volatile ( 

911 
"movl %4,%%ecx\n" 

912 
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */ 

913 
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ 

914 
"1:\n" 

915 
"movq (%0),%%mm1\n" /* mm1 = pix1[07] */ 

916 
"movq (%1),%%mm2\n" /* mm2 = pix2[07] */ 

917 
"movq 8(%0),%%mm3\n" /* mm3 = pix1[815] */ 

918 
"movq 8(%1),%%mm4\n" /* mm4 = pix2[815] */ 

919  
920 
/* todo: mm1mm2, mm3mm4 */ 

921 
/* algo: subtract mm1 from mm2 with saturation and vice versa */ 

922 
/* OR the results to get absolute difference */ 

923 
"movq %%mm1,%%mm5\n" 

924 
"movq %%mm3,%%mm6\n" 

925 
"psubusb %%mm2,%%mm1\n" 

926 
"psubusb %%mm4,%%mm3\n" 

927 
"psubusb %%mm5,%%mm2\n" 

928 
"psubusb %%mm6,%%mm4\n" 

929  
930 
"por %%mm1,%%mm2\n" 

931 
"por %%mm3,%%mm4\n" 

932  
933 
/* now convert to 16bit vectors so we can square them */ 

934 
"movq %%mm2,%%mm1\n" 

935 
"movq %%mm4,%%mm3\n" 

936  
937 
"punpckhbw %%mm0,%%mm2\n" 

938 
"punpckhbw %%mm0,%%mm4\n" 

939 
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ 

940 
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ 

941  
942 
"pmaddwd %%mm2,%%mm2\n" 

943 
"pmaddwd %%mm4,%%mm4\n" 

944 
"pmaddwd %%mm1,%%mm1\n" 

945 
"pmaddwd %%mm3,%%mm3\n" 

946  
947 
"add %3,%0\n" 

948 
"add %3,%1\n" 

949  
950 
"paddd %%mm2,%%mm1\n" 

951 
"paddd %%mm4,%%mm3\n" 

952 
"paddd %%mm1,%%mm7\n" 

953 
"paddd %%mm3,%%mm7\n" 

954  
955 
"decl %%ecx\n" 

956 
"jnz 1b\n" 

957  
958 
"movq %%mm7,%%mm1\n" 

959 
"psrlq $32, %%mm7\n" /* shift hi dword to lo */ 

960 
"paddd %%mm7,%%mm1\n" 

961 
"movd %%mm1,%2\n" 

962 
: "+r" (pix1), "+r" (pix2), "=r"(tmp) 

963 
: "r" ((long)line_size) , "m" (h) 

964 
: "%ecx"); 

965 
return tmp; 

966 
} 

967  
968 
static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 

969 
int tmp; 

970 
asm volatile ( 

971 
"shr $1,%2\n" 

972 
"pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */ 

973 
"pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */ 

974 
"1:\n" 

975 
"movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][015] */ 

976 
"movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][015] */ 

977 
"movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][015] */ 

978 
"movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][015] */ 

979  
980 
/* todo: mm1mm2, mm3mm4 */ 

981 
/* algo: subtract mm1 from mm2 with saturation and vice versa */ 

982 
/* OR the results to get absolute difference */ 

983 
"movdqa %%xmm1,%%xmm5\n" 

984 
"movdqa %%xmm3,%%xmm6\n" 

985 
"psubusb %%xmm2,%%xmm1\n" 

986 
"psubusb %%xmm4,%%xmm3\n" 

987 
"psubusb %%xmm5,%%xmm2\n" 

988 
"psubusb %%xmm6,%%xmm4\n" 

989  
990 
"por %%xmm1,%%xmm2\n" 

991 
"por %%xmm3,%%xmm4\n" 

992  
993 
/* now convert to 16bit vectors so we can square them */ 

994 
"movdqa %%xmm2,%%xmm1\n" 

995 
"movdqa %%xmm4,%%xmm3\n" 

996  
997 
"punpckhbw %%xmm0,%%xmm2\n" 

998 
"punpckhbw %%xmm0,%%xmm4\n" 

999 
"punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */ 

1000 
"punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */ 

1001  
1002 
"pmaddwd %%xmm2,%%xmm2\n" 

1003 
"pmaddwd %%xmm4,%%xmm4\n" 

1004 
"pmaddwd %%xmm1,%%xmm1\n" 

1005 
"pmaddwd %%xmm3,%%xmm3\n" 

1006  
1007 
"lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */ 

1008 
"lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */ 

1009  
1010 
"paddd %%xmm2,%%xmm1\n" 

1011 
"paddd %%xmm4,%%xmm3\n" 

1012 
"paddd %%xmm1,%%xmm7\n" 

1013 
"paddd %%xmm3,%%xmm7\n" 

1014  
1015 
"decl %2\n" 

1016 
"jnz 1b\n" 

1017  
1018 
"movdqa %%xmm7,%%xmm1\n" 

1019 
"psrldq $8, %%xmm7\n" /* shift hi qword to lo */ 

1020 
"paddd %%xmm1,%%xmm7\n" 

1021 
"movdqa %%xmm7,%%xmm1\n" 

1022 
"psrldq $4, %%xmm7\n" /* shift hi dword to lo */ 

1023 
"paddd %%xmm1,%%xmm7\n" 

1024 
"movd %%xmm7,%3\n" 

1025 
: "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) 

1026 
: "r" ((long)line_size)); 

1027 
return tmp; 

1028 
} 

1029  
1030 
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { 

1031 
int tmp; 

1032 
asm volatile ( 

1033 
"movl %3,%%ecx\n" 

1034 
"pxor %%mm7,%%mm7\n" 

1035 
"pxor %%mm6,%%mm6\n" 

1036  
1037 
"movq (%0),%%mm0\n" 

1038 
"movq %%mm0, %%mm1\n" 

1039 
"psllq $8, %%mm0\n" 

1040 
"psrlq $8, %%mm1\n" 

1041 
"psrlq $8, %%mm0\n" 

1042 
"movq %%mm0, %%mm2\n" 

1043 
"movq %%mm1, %%mm3\n" 

1044 
"punpcklbw %%mm7,%%mm0\n" 

1045 
"punpcklbw %%mm7,%%mm1\n" 

1046 
"punpckhbw %%mm7,%%mm2\n" 

1047 
"punpckhbw %%mm7,%%mm3\n" 

1048 
"psubw %%mm1, %%mm0\n" 

1049 
"psubw %%mm3, %%mm2\n" 

1050  
1051 
"add %2,%0\n" 

1052  
1053 
"movq (%0),%%mm4\n" 

1054 
"movq %%mm4, %%mm1\n" 

1055 
"psllq $8, %%mm4\n" 

1056 
"psrlq $8, %%mm1\n" 

1057 
"psrlq $8, %%mm4\n" 

1058 
"movq %%mm4, %%mm5\n" 

1059 
"movq %%mm1, %%mm3\n" 

1060 
"punpcklbw %%mm7,%%mm4\n" 

1061 
"punpcklbw %%mm7,%%mm1\n" 

1062 
"punpckhbw %%mm7,%%mm5\n" 

1063 
"punpckhbw %%mm7,%%mm3\n" 

1064 
"psubw %%mm1, %%mm4\n" 

1065 
"psubw %%mm3, %%mm5\n" 

1066 
"psubw %%mm4, %%mm0\n" 

1067 
"psubw %%mm5, %%mm2\n" 

1068 
"pxor %%mm3, %%mm3\n" 

1069 
"pxor %%mm1, %%mm1\n" 

1070 
"pcmpgtw %%mm0, %%mm3\n\t" 

1071 
"pcmpgtw %%mm2, %%mm1\n\t" 

1072 
"pxor %%mm3, %%mm0\n" 

1073 
"pxor %%mm1, %%mm2\n" 

1074 
"psubw %%mm3, %%mm0\n" 

1075 
"psubw %%mm1, %%mm2\n" 

1076 
"paddw %%mm0, %%mm2\n" 

1077 
"paddw %%mm2, %%mm6\n" 

1078  
1079 
"add %2,%0\n" 

1080 
"1:\n" 

1081  
1082 
"movq (%0),%%mm0\n" 

1083 
"movq %%mm0, %%mm1\n" 

1084 
"psllq $8, %%mm0\n" 

1085 
"psrlq $8, %%mm1\n" 

1086 
"psrlq $8, %%mm0\n" 

1087 
"movq %%mm0, %%mm2\n" 

1088 
"movq %%mm1, %%mm3\n" 

1089 
"punpcklbw %%mm7,%%mm0\n" 

1090 
"punpcklbw %%mm7,%%mm1\n" 

1091 
"punpckhbw %%mm7,%%mm2\n" 

1092 
"punpckhbw %%mm7,%%mm3\n" 

1093 
"psubw %%mm1, %%mm0\n" 

1094 
"psubw %%mm3, %%mm2\n" 

1095 
"psubw %%mm0, %%mm4\n" 

1096 
"psubw %%mm2, %%mm5\n" 

1097 
"pxor %%mm3, %%mm3\n" 

1098 
"pxor %%mm1, %%mm1\n" 

1099 
"pcmpgtw %%mm4, %%mm3\n\t" 

1100 
"pcmpgtw %%mm5, %%mm1\n\t" 

1101 
"pxor %%mm3, %%mm4\n" 

1102 
"pxor %%mm1, %%mm5\n" 

1103 
"psubw %%mm3, %%mm4\n" 

1104 
"psubw %%mm1, %%mm5\n" 

1105 
"paddw %%mm4, %%mm5\n" 

1106 
"paddw %%mm5, %%mm6\n" 

1107  
1108 
"add %2,%0\n" 

1109  
1110 
"movq (%0),%%mm4\n" 

1111 
"movq %%mm4, %%mm1\n" 

1112 
"psllq $8, %%mm4\n" 

1113 
"psrlq $8, %%mm1\n" 

1114 
"psrlq $8, %%mm4\n" 

1115 
"movq %%mm4, %%mm5\n" 

1116 
"movq %%mm1, %%mm3\n" 

1117 
"punpcklbw %%mm7,%%mm4\n" 

1118 
"punpcklbw %%mm7,%%mm1\n" 

1119 
"punpckhbw %%mm7,%%mm5\n" 

1120 
"punpckhbw %%mm7,%%mm3\n" 

1121 
"psubw %%mm1, %%mm4\n" 

1122 
"psubw %%mm3, %%mm5\n" 

1123 
"psubw %%mm4, %%mm0\n" 

1124 
"psubw %%mm5, %%mm2\n" 

1125 
"pxor %%mm3, %%mm3\n" 

1126 
"pxor %%mm1, %%mm1\n" 

1127 
"pcmpgtw %%mm0, %%mm3\n\t" 

1128 
"pcmpgtw %%mm2, %%mm1\n\t" 

1129 
"pxor %%mm3, %%mm0\n" 

1130 
"pxor %%mm1, %%mm2\n" 

1131 
"psubw %%mm3, %%mm0\n" 

1132 
"psubw %%mm1, %%mm2\n" 

1133 
"paddw %%mm0, %%mm2\n" 

1134 
"paddw %%mm2, %%mm6\n" 

1135  
1136 
"add %2,%0\n" 

1137 
"subl $2, %%ecx\n" 

1138 
" jnz 1b\n" 

1139  
1140 
"movq %%mm6, %%mm0\n" 

1141 
"punpcklwd %%mm7,%%mm0\n" 

1142 
"punpckhwd %%mm7,%%mm6\n" 

1143 
"paddd %%mm0, %%mm6\n" 

1144  
1145 
"movq %%mm6,%%mm0\n" 

1146 
"psrlq $32, %%mm6\n" 

1147 
"paddd %%mm6,%%mm0\n" 

1148 
"movd %%mm0,%1\n" 

1149 
: "+r" (pix1), "=r"(tmp) 

1150 
: "r" ((long)line_size) , "g" (h2) 

1151 
: "%ecx"); 

1152 
return tmp; 

1153 
} 

1154  
1155 
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { 

1156 
int tmp; 

1157 
uint8_t * pix= pix1; 

1158 
asm volatile ( 

1159 
"movl %3,%%ecx\n" 

1160 
"pxor %%mm7,%%mm7\n" 

1161 
"pxor %%mm6,%%mm6\n" 

1162  
1163 
"movq (%0),%%mm0\n" 

1164 
"movq 1(%0),%%mm1\n" 

1165 
"movq %%mm0, %%mm2\n" 

1166 
"movq %%mm1, %%mm3\n" 

1167 
"punpcklbw %%mm7,%%mm0\n" 

1168 
"punpcklbw %%mm7,%%mm1\n" 

1169 
"punpckhbw %%mm7,%%mm2\n" 

1170 
"punpckhbw %%mm7,%%mm3\n" 

1171 
"psubw %%mm1, %%mm0\n" 

1172 
"psubw %%mm3, %%mm2\n" 

1173  
1174 
"add %2,%0\n" 

1175  
1176 
"movq (%0),%%mm4\n" 

1177 
"movq 1(%0),%%mm1\n" 

1178 
"movq %%mm4, %%mm5\n" 

1179 
"movq %%mm1, %%mm3\n" 

1180 
"punpcklbw %%mm7,%%mm4\n" 

1181 
"punpcklbw %%mm7,%%mm1\n" 

1182 
"punpckhbw %%mm7,%%mm5\n" 

1183 
"punpckhbw %%mm7,%%mm3\n" 

1184 
"psubw %%mm1, %%mm4\n" 

1185 
"psubw %%mm3, %%mm5\n" 

1186 
"psubw %%mm4, %%mm0\n" 

1187 
"psubw %%mm5, %%mm2\n" 

1188 
"pxor %%mm3, %%mm3\n" 

1189 
"pxor %%mm1, %%mm1\n" 

1190 
"pcmpgtw %%mm0, %%mm3\n\t" 

1191 
"pcmpgtw %%mm2, %%mm1\n\t" 

1192 
"pxor %%mm3, %%mm0\n" 

1193 
"pxor %%mm1, %%mm2\n" 

1194 
"psubw %%mm3, %%mm0\n" 

1195 
"psubw %%mm1, %%mm2\n" 

1196 
"paddw %%mm0, %%mm2\n" 

1197 
"paddw %%mm2, %%mm6\n" 

1198  
1199 
"add %2,%0\n" 

1200 
"1:\n" 

1201  
1202 
"movq (%0),%%mm0\n" 

1203 
"movq 1(%0),%%mm1\n" 

1204 
"movq %%mm0, %%mm2\n" 

1205 
"movq %%mm1, %%mm3\n" 

1206 
"punpcklbw %%mm7,%%mm0\n" 

1207 
"punpcklbw %%mm7,%%mm1\n" 

1208 
"punpckhbw %%mm7,%%mm2\n" 

1209 
"punpckhbw %%mm7,%%mm3\n" 

1210 
"psubw %%mm1, %%mm0\n" 

1211 
"psubw %%mm3, %%mm2\n" 

1212 
"psubw %%mm0, %%mm4\n" 

1213 
"psubw %%mm2, %%mm5\n" 

1214 
"pxor %%mm3, %%mm3\n" 

1215 
"pxor %%mm1, %%mm1\n" 

1216 
"pcmpgtw %%mm4, %%mm3\n\t" 

1217 
"pcmpgtw %%mm5, %%mm1\n\t" 

1218 
"pxor %%mm3, %%mm4\n" 

1219 
"pxor %%mm1, %%mm5\n" 

1220 
"psubw %%mm3, %%mm4\n" 

1221 
"psubw %%mm1, %%mm5\n" 

1222 
"paddw %%mm4, %%mm5\n" 

1223 
"paddw %%mm5, %%mm6\n" 

1224  
1225 
"add %2,%0\n" 

1226  
1227 
"movq (%0),%%mm4\n" 

1228 
"movq 1(%0),%%mm1\n" 

1229 
"movq %%mm4, %%mm5\n" 

1230 
"movq %%mm1, %%mm3\n" 

1231 
"punpcklbw %%mm7,%%mm4\n" 

1232 
"punpcklbw %%mm7,%%mm1\n" 

1233 
"punpckhbw %%mm7,%%mm5\n" 

1234 
"punpckhbw %%mm7,%%mm3\n" 

1235 
"psubw %%mm1, %%mm4\n" 

1236 
"psubw %%mm3, %%mm5\n" 

1237 
"psubw %%mm4, %%mm0\n" 

1238 
"psubw %%mm5, %%mm2\n" 

1239 
"pxor %%mm3, %%mm3\n" 

1240 
"pxor %%mm1, %%mm1\n" 

1241 
"pcmpgtw %%mm0, %%mm3\n\t" 

1242 
"pcmpgtw %%mm2, %%mm1\n\t" 

1243 
"pxor %%mm3, %%mm0\n" 

1244 
"pxor %%mm1, %%mm2\n" 

1245 
"psubw %%mm3, %%mm0\n" 

1246 
"psubw %%mm1, %%mm2\n" 

1247 
"paddw %%mm0, %%mm2\n" 

1248 
"paddw %%mm2, %%mm6\n" 

1249  
1250 
"add %2,%0\n" 

1251 
"subl $2, %%ecx\n" 

1252 
" jnz 1b\n" 

1253  
1254 
"movq %%mm6, %%mm0\n" 

1255 
"punpcklwd %%mm7,%%mm0\n" 

1256 
"punpckhwd %%mm7,%%mm6\n" 

1257 
"paddd %%mm0, %%mm6\n" 

1258  
1259 
"movq %%mm6,%%mm0\n" 

1260 
"psrlq $32, %%mm6\n" 

1261 
"paddd %%mm6,%%mm0\n" 

1262 
"movd %%mm0,%1\n" 

1263 
: "+r" (pix1), "=r"(tmp) 

1264 
: "r" ((long)line_size) , "g" (h2) 

1265 
: "%ecx"); 

1266 
return tmp + hf_noise8_mmx(pix+8, line_size, h); 

1267 
} 

1268  
1269 
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 

1270 
MpegEncContext *c = p; 

1271 
int score1, score2; 

1272  
1273 
if(c) score1 = c>dsp.sse[0](c, pix1, pix2, line_size, h); 

1274 
else score1 = sse16_mmx(c, pix1, pix2, line_size, h); 

1275 
score2= hf_noise16_mmx(pix1, line_size, h)  hf_noise16_mmx(pix2, line_size, h); 

1276  
1277 
if(c) return score1 + FFABS(score2)*c>avctx>nsse_weight; 

1278 
else return score1 + FFABS(score2)*8; 

1279 
} 

1280  
1281 
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 

1282 
MpegEncContext *c = p; 

1283 
int score1= sse8_mmx(c, pix1, pix2, line_size, h); 

1284 
int score2= hf_noise8_mmx(pix1, line_size, h)  hf_noise8_mmx(pix2, line_size, h); 

1285  
1286 
if(c) return score1 + FFABS(score2)*c>avctx>nsse_weight; 

1287 
else return score1 + FFABS(score2)*8; 

1288 
} 

1289  
1290 
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 

1291 
int tmp; 

1292  
1293 
assert( (((int)pix) & 7) == 0); 

1294 
assert((line_size &7) ==0); 

1295  
1296 
#define SUM(in0, in1, out0, out1) \ 

1297 
"movq (%0), %%mm2\n"\ 

1298 
"movq 8(%0), %%mm3\n"\ 

1299 
"add %2,%0\n"\ 

1300 
"movq %%mm2, " #out0 "\n"\ 

1301 
"movq %%mm3, " #out1 "\n"\ 

1302 
"psubusb " #in0 ", %%mm2\n"\ 

1303 
"psubusb " #in1 ", %%mm3\n"\ 

1304 
"psubusb " #out0 ", " #in0 "\n"\ 

1305 
"psubusb " #out1 ", " #in1 "\n"\ 

1306 
"por %%mm2, " #in0 "\n"\ 

1307 
"por %%mm3, " #in1 "\n"\ 

1308 
"movq " #in0 ", %%mm2\n"\ 

1309 
"movq " #in1 ", %%mm3\n"\ 

1310 
"punpcklbw %%mm7, " #in0 "\n"\ 

1311 
"punpcklbw %%mm7, " #in1 "\n"\ 

1312 
"punpckhbw %%mm7, %%mm2\n"\ 

1313 
"punpckhbw %%mm7, %%mm3\n"\ 

1314 
"paddw " #in1 ", " #in0 "\n"\ 

1315 
"paddw %%mm3, %%mm2\n"\ 

1316 
"paddw %%mm2, " #in0 "\n"\ 

1317 
"paddw " #in0 ", %%mm6\n" 

1318  
1319  
1320 
asm volatile ( 

1321 
"movl %3,%%ecx\n" 

1322 
"pxor %%mm6,%%mm6\n" 

1323 
"pxor %%mm7,%%mm7\n" 

1324 
"movq (%0),%%mm0\n" 

1325 
"movq 8(%0),%%mm1\n" 

1326 
"add %2,%0\n" 

1327 
"subl $2, %%ecx\n" 

1328 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 

1329 
"1:\n" 

1330  
1331 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 

1332  
1333 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 

1334  
1335 
"subl $2, %%ecx\n" 

1336 
"jnz 1b\n" 

1337  
1338 
"movq %%mm6,%%mm0\n" 

1339 
"psrlq $32, %%mm6\n" 

1340 
"paddw %%mm6,%%mm0\n" 

1341 
"movq %%mm0,%%mm6\n" 

1342 
"psrlq $16, %%mm0\n" 

1343 
"paddw %%mm6,%%mm0\n" 

1344 
"movd %%mm0,%1\n" 

1345 
: "+r" (pix), "=r"(tmp) 

1346 
: "r" ((long)line_size) , "m" (h) 

1347 
: "%ecx"); 

1348 
return tmp & 0xFFFF; 

1349 
} 

1350 
#undef SUM 

1351  
1352 
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { 

1353 
int tmp; 

1354  
1355 
assert( (((int)pix) & 7) == 0); 

1356 
assert((line_size &7) ==0); 

1357  
1358 
#define SUM(in0, in1, out0, out1) \ 

1359 
"movq (%0), " #out0 "\n"\ 

1360 
"movq 8(%0), " #out1 "\n"\ 

1361 
"add %2,%0\n"\ 

1362 
"psadbw " #out0 ", " #in0 "\n"\ 

1363 
"psadbw " #out1 ", " #in1 "\n"\ 

1364 
"paddw " #in1 ", " #in0 "\n"\ 

1365 
"paddw " #in0 ", %%mm6\n" 

1366  
1367 
asm volatile ( 

1368 
"movl %3,%%ecx\n" 

1369 
"pxor %%mm6,%%mm6\n" 

1370 
"pxor %%mm7,%%mm7\n" 

1371 
"movq (%0),%%mm0\n" 

1372 
"movq 8(%0),%%mm1\n" 

1373 
"add %2,%0\n" 

1374 
"subl $2, %%ecx\n" 

1375 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 

1376 
"1:\n" 

1377  
1378 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 

1379  
1380 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 

1381  
1382 
"subl $2, %%ecx\n" 

1383 
"jnz 1b\n" 

1384  
1385 
"movd %%mm6,%1\n" 

1386 
: "+r" (pix), "=r"(tmp) 

1387 
: "r" ((long)line_size) , "m" (h) 

1388 
: "%ecx"); 

1389 
return tmp; 

1390 
} 

1391 
#undef SUM 

1392  
1393 
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 

1394 
int tmp; 

1395  
1396 
assert( (((int)pix1) & 7) == 0); 

1397 
assert( (((int)pix2) & 7) == 0); 

1398 
assert((line_size &7) ==0); 

1399  
1400 
#define SUM(in0, in1, out0, out1) \ 

1401 
"movq (%0),%%mm2\n"\ 

1402 
"movq (%1)," #out0 "\n"\ 

1403 
"movq 8(%0),%%mm3\n"\ 

1404 
"movq 8(%1)," #out1 "\n"\ 

1405 
"add %3,%0\n"\ 

1406 
"add %3,%1\n"\ 

1407 
"psubb " #out0 ", %%mm2\n"\ 

1408 
"psubb " #out1 ", %%mm3\n"\ 

1409 
"pxor %%mm7, %%mm2\n"\ 

1410 
"pxor %%mm7, %%mm3\n"\ 

1411 
"movq %%mm2, " #out0 "\n"\ 

1412 
"movq %%mm3, " #out1 "\n"\ 

1413 
"psubusb " #in0 ", %%mm2\n"\ 

1414 
"psubusb " #in1 ", %%mm3\n"\ 

1415 
"psubusb " #out0 ", " #in0 "\n"\ 

1416 
"psubusb " #out1 ", " #in1 "\n"\ 

1417 
"por %%mm2, " #in0 "\n"\ 

1418 
"por %%mm3, " #in1 "\n"\ 

1419 
"movq " #in0 ", %%mm2\n"\ 

1420 
"movq " #in1 ", %%mm3\n"\ 

1421 
"punpcklbw %%mm7, " #in0 "\n"\ 

1422 
"punpcklbw %%mm7, " #in1 "\n"\ 

1423 
"punpckhbw %%mm7, %%mm2\n"\ 

1424 
"punpckhbw %%mm7, %%mm3\n"\ 

1425 
"paddw " #in1 ", " #in0 "\n"\ 

1426 
"paddw %%mm3, %%mm2\n"\ 

1427 
"paddw %%mm2, " #in0 "\n"\ 

1428 
"paddw " #in0 ", %%mm6\n" 

1429  
1430  
1431 
asm volatile ( 

1432 
"movl %4,%%ecx\n" 

1433 
"pxor %%mm6,%%mm6\n" 

1434 
"pcmpeqw %%mm7,%%mm7\n" 

1435 
"psllw $15, %%mm7\n" 

1436 
"packsswb %%mm7, %%mm7\n" 

1437 
"movq (%0),%%mm0\n" 

1438 
"movq (%1),%%mm2\n" 

1439 
"movq 8(%0),%%mm1\n" 

1440 
"movq 8(%1),%%mm3\n" 

1441 
"add %3,%0\n" 

1442 
"add %3,%1\n" 

1443 
"subl $2, %%ecx\n" 

1444 
"psubb %%mm2, %%mm0\n" 

1445 
"psubb %%mm3, %%mm1\n" 

1446 
"pxor %%mm7, %%mm0\n" 

1447 
"pxor %%mm7, %%mm1\n" 

1448 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 

1449 
"1:\n" 

1450  
1451 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 

1452  
1453 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 

1454  
1455 
"subl $2, %%ecx\n" 

1456 
"jnz 1b\n" 

1457  
1458 
"movq %%mm6,%%mm0\n" 

1459 
"psrlq $32, %%mm6\n" 

1460 
"paddw %%mm6,%%mm0\n" 

1461 
"movq %%mm0,%%mm6\n" 

1462 
"psrlq $16, %%mm0\n" 

1463 
"paddw %%mm6,%%mm0\n" 

1464 
"movd %%mm0,%2\n" 

1465 
: "+r" (pix1), "+r" (pix2), "=r"(tmp) 

1466 
: "r" ((long)line_size) , "m" (h) 

1467 
: "%ecx"); 

1468 
return tmp & 0x7FFF; 

1469 
} 

1470 
#undef SUM 

1471  
1472 
static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { 

1473 
int tmp; 

1474  
1475 
assert( (((int)pix1) & 7) == 0); 

1476 
assert( (((int)pix2) & 7) == 0); 

1477 
assert((line_size &7) ==0); 

1478  
1479 
#define SUM(in0, in1, out0, out1) \ 

1480 
"movq (%0)," #out0 "\n"\ 

1481 
"movq (%1),%%mm2\n"\ 

1482 
"movq 8(%0)," #out1 "\n"\ 

1483 
"movq 8(%1),%%mm3\n"\ 

1484 
"add %3,%0\n"\ 

1485 
"add %3,%1\n"\ 

1486 
"psubb %%mm2, " #out0 "\n"\ 

1487 
"psubb %%mm3, " #out1 "\n"\ 

1488 
"pxor %%mm7, " #out0 "\n"\ 

1489 
"pxor %%mm7, " #out1 "\n"\ 

1490 
"psadbw " #out0 ", " #in0 "\n"\ 

1491 
"psadbw " #out1 ", " #in1 "\n"\ 

1492 
"paddw " #in1 ", " #in0 "\n"\ 

1493 
"paddw " #in0 ", %%mm6\n" 

1494  
1495 
asm volatile ( 

1496 
"movl %4,%%ecx\n" 

1497 
"pxor %%mm6,%%mm6\n" 

1498 
"pcmpeqw %%mm7,%%mm7\n" 

1499 
"psllw $15, %%mm7\n" 

1500 
"packsswb %%mm7, %%mm7\n" 

1501 
"movq (%0),%%mm0\n" 

1502 
"movq (%1),%%mm2\n" 

1503 
"movq 8(%0),%%mm1\n" 

1504 
"movq 8(%1),%%mm3\n" 

1505 
"add %3,%0\n" 

1506 
"add %3,%1\n" 

1507 
"subl $2, %%ecx\n" 

1508 
"psubb %%mm2, %%mm0\n" 

1509 
"psubb %%mm3, %%mm1\n" 

1510 
"pxor %%mm7, %%mm0\n" 

1511 
"pxor %%mm7, %%mm1\n" 

1512 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 

1513 
"1:\n" 

1514  
1515 
SUM(%%mm4, %%mm5, %%mm0, %%mm1) 

1516  
1517 
SUM(%%mm0, %%mm1, %%mm4, %%mm5) 

1518  
1519 
"subl $2, %%ecx\n" 

1520 
"jnz 1b\n" 

1521  
1522 
"movd %%mm6,%2\n" 

1523 
: "+r" (pix1), "+r" (pix2), "=r"(tmp) 

1524 
: "r" ((long)line_size) , "m" (h) 

1525 
: "%ecx"); 

1526 
return tmp; 

1527 
} 

1528 
#undef SUM 

1529  
1530 
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ 

1531 
long i=0; 

1532 
asm volatile( 

1533 
"1: \n\t" 

1534 
"movq (%2, %0), %%mm0 \n\t" 

1535 
"movq (%1, %0), %%mm1 \n\t" 

1536 
"psubb %%mm0, %%mm1 \n\t" 

1537 
"movq %%mm1, (%3, %0) \n\t" 

1538 
"movq 8(%2, %0), %%mm0 \n\t" 

1539 
"movq 8(%1, %0), %%mm1 \n\t" 

1540 
"psubb %%mm0, %%mm1 \n\t" 

1541 
"movq %%mm1, 8(%3, %0) \n\t" 

1542 
"add $16, %0 \n\t" 

1543 
"cmp %4, %0 \n\t" 

1544 
" jb 1b \n\t" 

1545 
: "+r" (i) 

1546 
: "r"(src1), "r"(src2), "r"(dst), "r"((long)w15) 

1547 
); 

1548 
for(; i<w; i++) 

1549 
dst[i+0] = src1[i+0]src2[i+0]; 

1550 
} 

1551  
1552 
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ 

1553 
long i=0; 

1554 
uint8_t l, lt; 

1555  
1556 
asm volatile( 

1557 
"1: \n\t" 

1558 
"movq 1(%1, %0), %%mm0 \n\t" // LT 

1559 
"movq (%1, %0), %%mm1 \n\t" // T 

1560 
"movq 1(%2, %0), %%mm2 \n\t" // L 

1561 
"movq (%2, %0), %%mm3 \n\t" // X 

1562 
"movq %%mm2, %%mm4 \n\t" // L 

1563 
"psubb %%mm0, %%mm2 \n\t" 

1564 
"paddb %%mm1, %%mm2 \n\t" // L + T  LT 

1565 
"movq %%mm4, %%mm5 \n\t" // L 

1566 
"pmaxub %%mm1, %%mm4 \n\t" // max(T, L) 

1567 
"pminub %%mm5, %%mm1 \n\t" // min(T, L) 

1568 
"pminub %%mm2, %%mm4 \n\t" 

1569 
"pmaxub %%mm1, %%mm4 \n\t" 

1570 
"psubb %%mm4, %%mm3 \n\t" // dst  pred 

1571 
"movq %%mm3, (%3, %0) \n\t" 

1572 
"add $8, %0 \n\t" 

1573 
"cmp %4, %0 \n\t" 

1574 
" jb 1b \n\t" 

1575 
: "+r" (i) 

1576 
: "r"(src1), "r"(src2), "r"(dst), "r"((long)w) 

1577 
); 

1578  
1579 
l= *left; 

1580 
lt= *left_top; 

1581  
1582 
dst[0]= src2[0]  mid_pred(l, src1[0], (l + src1[0]  lt)&0xFF); 

1583  
1584 
*left_top= src1[w1]; 

1585 
*left = src2[w1]; 

1586 
} 

1587  
1588  699 
#define PAETH(cpu, abs3)\ 
1589  700 
void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\ 
1590  701 
{\ 
...  ...  
1659  770 
PAETH(ssse3, ABS3_SSSE3) 
1660  771 
#endif 
1661  772  
1662 
/* Load one row from p1 and p2 and leave the 16-bit difference in "a"
 * (uses "t" as scratch; the double punpcklbw puts p1 bytes in the high
 * halves of both words so psubw yields (p1-p2)<<8 consistently). */
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
    "mov"#m" "#p1", "#a"            \n\t"\
    "mov"#m" "#p2", "#t"            \n\t"\
    "punpcklbw "#a", "#t"           \n\t"\
    "punpcklbw "#a", "#a"           \n\t"\
    "psubw     "#t", "#a"           \n\t"

/* Compute 8 rows of differences into mm0..mm7 (or xmm0..xmm7). One
 * register must spill through "temp" because all eight are in use. */
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
    uint8_t *p1b=p1, *p2b=p2;\
    asm volatile(\
        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
        "add %4, %1                     \n\t"\
        "add %4, %2                     \n\t"\
        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
        "mov"#m1" "#mm"0, %0            \n\t"\
        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
        "mov"#m1" %0, "#mm"0            \n\t"\
        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
        : "r"((long)stride), "r"((long)stride*3)\
    );\
}
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)

#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q,   %%mm,  p1, p2, stride, temp)
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)

1692  
1693 
/* Two interleaved butterfly steps: (a,b) -> (a+b, b-a), computed without
 * a temporary by doubling b before subtracting. */
#define LBUTTERFLY2(a1,b1,a2,b2)\
    "paddw " #b1 ", " #a1 "           \n\t"\
    "paddw " #b2 ", " #a2 "           \n\t"\
    "paddw " #b1 ", " #b1 "           \n\t"\
    "paddw " #b2 ", " #b2 "           \n\t"\
    "psubw " #a1 ", " #b1 "           \n\t"\
    "psubw " #a2 ", " #b2 "           \n\t"

/* 8-point Hadamard transform across registers m0..m7 (three butterfly
 * stages of the fast Walsh-Hadamard transform). */
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m1, m2, m3)\
        LBUTTERFLY2(m4, m5, m6, m7)\
        LBUTTERFLY2(m0, m2, m1, m3)\
        LBUTTERFLY2(m4, m6, m5, m7)\
        LBUTTERFLY2(m0, m4, m1, m5)\
        LBUTTERFLY2(m2, m6, m3, m7)\

#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)

1710  
1711 
/* Absolute value of packed words in "a", using "z" as scratch.
 * MMX form: sign mask via pcmpgtw, then xor/sub (two's-complement abs). */
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "pcmpgtw " #a ", " #z "           \n\t"\
    "pxor " #z ", " #a "              \n\t"\
    "psubw " #z ", " #a "             \n\t"

/* MMX2 form: abs(a) = max(a, -a). */
#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z "              \n\t"\
    "psubw " #a ", " #z "             \n\t"\
    "pmaxsw " #z ", " #a "            \n\t"

/* SSSE3 form: single instruction; "z" is unused. */
#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a "             \n\t"

/* abs(a) accumulated into "sum" with unsigned saturation. */
#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum "         \n\t"

/* Sum |xmm0..xmm7| into xmm0 using xmm8/xmm9 as scratch — only possible
 * on x86-64 where 16 XMM registers exist (no spill needed). */
#define MMABS_SUM_8x8_NOSPILL\
    MMABS(%%xmm0, %%xmm8)\
    MMABS(%%xmm1, %%xmm9)\
    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"

#ifdef ARCH_X86_64
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
#else
/* 32-bit: only 8 XMM registers, so xmm7 is spilled to memory (%1) and
 * reused as scratch, then reloaded into xmm2 for its own accumulation. */
#define MMABS_SUM_8x8_SSE2\
    "movdqa %%xmm7, (%1)              \n\t"\
    MMABS(%%xmm0, %%xmm7)\
    MMABS(%%xmm1, %%xmm7)\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
    "movdqa (%1), %%xmm2              \n\t"\
    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
    "paddusw %%xmm1, %%xmm0           \n\t"
#endif

1756  
1757 
/* Load four consecutive quadwords from (%1)+o into a..d. */
#define LOAD4(o, a, b, c, d)\
    "movq "#o"(%1),    "#a"           \n\t"\
    "movq "#o"+8(%1),  "#b"           \n\t"\
    "movq "#o"+16(%1), "#c"           \n\t"\
    "movq "#o"+24(%1), "#d"           \n\t"\

/* Store a..d to four consecutive quadwords at (%1)+o. */
#define STORE4(o, a, b, c, d)\
    "movq "#a", "#o"(%1)              \n\t"\
    "movq "#b", "#o"+8(%1)            \n\t"\
    "movq "#c", "#o"+16(%1)           \n\t"\
    "movq "#d", "#o"+24(%1)           \n\t"\


1768  
1769 
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */

/* Horizontal sum of the four words in "a" into 32-bit "dst" (saturating);
 * MMX form uses shift-and-add folding, "t" is scratch. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $32, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movq "#a", "#t"                  \n\t"\
    "psrlq $16, "#a"                  \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

/* MMX2 form: pshufw replaces the shift/copy pairs. */
#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshufw $0x01, "#a", "#t"         \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\

/* SSE2 form: fold eight words via movhlps + pshuflw. */
#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t"               \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x0E, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "pshuflw $0x01, "#a", "#t"        \n\t"\
    "paddusw "#t", "#a"               \n\t"\
    "movd "#a", "#dst"                \n\t"\


1796  
1797 
#define HADAMARD8_DIFF_MMX(cpu) \ 

1798 
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ 

1799 
DECLARE_ALIGNED_8(uint64_t, temp[13]);\ 

1800 
int sum;\ 

1801 
\ 

1802 
assert(h==8);\ 

1803 
\ 

1804 
DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\ 

1805 
\ 

1806 
asm volatile(\ 

1807 
HADAMARD48\ 

1808 
\ 

1809 
"movq %%mm7, 96(%1) \n\t"\ 

1810 
\ 

1811 
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ 

1812 
STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\ 

1813 
\ 

1814 
"movq 96(%1), %%mm7 \n\t"\ 

1815 
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ 

1816 
STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\ 

1817 
\ 

1818 
: "=r" (sum)\ 

1819 
: "r"(temp)\ 

1820 
);\ 

1821 
\ 

1822 
DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\ 

1823 
\ 

1824 
asm volatile(\ 

1825 
HADAMARD48\ 

1826 
\ 

1827 
"movq %%mm7, 96(%1) \n\t"\ 

1828 
\ 

1829 
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ 

1830 
STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\ 

1831 
\ 

1832 
"movq 96(%1), %%mm7 \n\t"\ 

1833 
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ 

1834 
"movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ 

1835 
"movq %%mm6, %%mm7 \n\t"\ 

1836 
"movq %%mm0, %%mm6 \n\t"\ 

1837 
\ 

1838 
LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\ 

1839 
\ 

1840 
HADAMARD48\ 

1841 
"movq %%mm7, 64(%1) \n\t"\ 

1842 
MMABS(%%mm0, %%mm7)\ 

1843 
MMABS(%%mm1, %%mm7)\ 

1844 
MMABS_SUM(%%mm2, %%mm7, %%mm0)\ 

1845 
MMABS_SUM(%%mm3, %%mm7, %%mm1)\ 

1846 
MMABS_SUM(%%mm4, %%mm7, %%mm0)\ 

1847 
MMABS_SUM(%%mm5, %%mm7, %%mm1)\ 

1848 
MMABS_SUM(%%mm6, %%mm7, %%mm0)\ 

1849 
"movq 64(%1), %%mm2 \n\t"\ 

1850 
MMABS_SUM(%%mm2, %%mm7, %%mm1)\ 

1851 
"paddusw %%mm1, %%mm0 \n\t"\ 

1852 
"movq %%mm0, 64(%1) \n\t"\ 

1853 
\ 

1854 
LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\ 

1855 
LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\ 

1856 
\ 

1857 
HADAMARD48\ 

1858 
"movq %%mm7, (%1) \n\t"\ 

1859 
MMABS(%%mm0, %%mm7)\ 

1860 
MMABS(%%mm1, %%mm7)\ 

1861 
MMABS_SUM(%%mm2, %%mm7, %%mm0)\ 

1862 
MMABS_SUM(%%mm3, %%mm7, %%mm1)\ 

1863 
MMABS_SUM(%%mm4, %%mm7, %%mm0)\ 

1864 
MMABS_SUM(%%mm5, %%mm7, %%mm1)\ 

1865 
MMABS_SUM(%%mm6, %%mm7, %%mm0)\ 

1866 
"movq (%1), %%mm2 \n\t"\ 

1867 
MMABS_SUM(%%mm2, %%mm7, %%mm1)\ 

1868 
"paddusw 64(%1), %%mm0 \n\t"\ 

1869 
"paddusw %%mm1, %%mm0 \n\t"\ 

1870 
\ 

1871 
HSUM(%%mm0, %%mm1, %0)\ 

1872 
\ 

1873 
: "=r" (sum)\ 

1874 
: "r"(temp)\ 

1875 
);\ 

1876 
return sum&0xFFFF;\ 

1877 
}\ 

1878 
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) 

1879  
1880 
#define HADAMARD8_DIFF_SSE2(cpu) \ 

1881 
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ 

1882 
DECLARE_ALIGNED_16(uint64_t, temp[4]);\ 

1883 
int sum;\ 

1884 
\ 

1885 
assert(h==8);\ 

1886 
\ 

1887 
DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\ 

1888 
\ 

1889 
asm volatile(\ 

1890 
HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\ 

1891 
TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\ 

1892 
HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\ 

1893 
MMABS_SUM_8x8\ 

1894 
HSUM_SSE2(%%xmm0, %%xmm1, %0)\ 

1895 
: "=r" (sum)\ 

1896 
: "r"(temp)\ 

1897 
);\ 

1898 
return sum&0xFFFF;\ 

1899 
}\ 

1900 
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) 

1901  
1902 
#define MMABS(a,z) MMABS_MMX(a,z) 

1903 
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst) 

1904 
HADAMARD8_DIFF_MMX(mmx) 

1905 
#undef MMABS 

1906 
#undef HSUM 

1907  
1908 
#define MMABS(a,z) MMABS_MMX2(a,z) 

1909 
#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2 

1910 
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) 

1911 
HADAMARD8_DIFF_MMX(mmx2) 

1912 
HADAMARD8_DIFF_SSE2(sse2) 

1913 
#undef MMABS 

1914 
#undef MMABS_SUM_8x8 

1915 
#undef HSUM 

1916  
1917 
#ifdef HAVE_SSSE3 

1918 
#define MMABS(a,z) MMABS_SSSE3(a,z) 

1919 
#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL 

1920 
HADAMARD8_DIFF_SSE2(ssse3) 

1921 
#undef MMABS 

1922 
#undef MMABS_SUM_8x8 

1923 
#endif 

1924  
1925 
#define DCT_SAD4(m,mm,o)\ 

1926 
"mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ 

1927 
"mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ 

1928 
"mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ 

1929 
"mov"#m" "#o"+48(%1), "#mm"5 \n\t"\ 

1930 
MMABS_SUM(mm##2, mm##6, mm##0)\ 

1931 
MMABS_SUM(mm##3, mm##7, mm##1)\ 

1932 
MMABS_SUM(mm##4, mm##6, mm##0)\ 

1933 
MMABS_SUM(mm##5, mm##7, mm##1)\ 

1934  
1935 
#define DCT_SAD_MMX\ 

1936 
"pxor %%mm0, %%mm0 \n\t"\ 

1937 
"pxor %%mm1, %%mm1 \n\t"\ 

1938 
DCT_SAD4(q, %%mm, 0)\ 

1939 
DCT_SAD4(q, %%mm, 8)\ 

1940 
DCT_SAD4(q, %%mm, 64)\ 

1941 
DCT_SAD4(q, %%mm, 72)\ 

1942 
"paddusw %%mm1, %%mm0 \n\t"\ 

1943 
HSUM(%%mm0, %%mm1, %0) 

1944  
1945 
#define DCT_SAD_SSE2\ 

1946 
"pxor %%xmm0, %%xmm0 \n\t"\ 

1947 
"pxor %%xmm1, %%xmm1 \n\t"\ 

1948 
DCT_SAD4(dqa, %%xmm, 0)\ 

1949 
DCT_SAD4(dqa, %%xmm, 64)\ 

1950 
"paddusw %%xmm1, %%xmm0 \n\t"\ 

1951 
HSUM(%%xmm0, %%xmm1, %0) 

1952  
1953 
#define DCT_SAD_FUNC(cpu) \ 

1954 
static int sum_abs_dctelem_##cpu(DCTELEM *block){\ 

1955 
int sum;\ 

1956 
asm volatile(\ 

1957 
DCT_SAD\ 

1958 
:"=r"(sum)\ 

1959 
:"r"(block)\ 

1960 
);\ 

1961 
return sum&0xFFFF;\ 

1962 
} 

1963  
1964 
#define DCT_SAD DCT_SAD_MMX 

1965 
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst) 

1966 
#define MMABS(a,z) MMABS_MMX(a,z) 

1967 
DCT_SAD_FUNC(mmx) 

1968 
#undef MMABS 

1969 
#undef HSUM 

1970  
1971 
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) 

1972 
#define MMABS(a,z) MMABS_MMX2(a,z) 

1973 
DCT_SAD_FUNC(mmx2) 

1974 
#undef HSUM 

1975 
#undef DCT_SAD 

1976  
1977 
#define DCT_SAD DCT_SAD_SSE2 

1978 
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst) 

1979 
DCT_SAD_FUNC(sse2) 

1980 
#undef MMABS 

1981  
1982 
#ifdef HAVE_SSSE3 

1983 
#define MMABS(a,z) MMABS_SSSE3(a,z) 

1984 
DCT_SAD_FUNC(ssse3) 

1985 
#undef MMABS 

1986 
#endif 

1987 
#undef HSUM 

1988 
#undef DCT_SAD 

1989  
1990 
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){ 

1991 
int sum; 

1992 
long i=size; 

1993 
asm volatile( 

1994 
"pxor %%mm4, %%mm4 \n" 

1995 
"1: \n" 

1996 
"sub $8, %0 \n" 

1997 
"movq (%2,%0), %%mm2 \n" 

1998 
"movq (%3,%0,2), %%mm0 \n" 

1999 
"movq 8(%3,%0,2), %%mm1 \n" 

2000 
"punpckhbw %%mm2, %%mm3 \n" 

2001 
"punpcklbw %%mm2, %%mm2 \n" 

2002 
"psraw $8, %%mm3 \n" 

2003 
"psraw $8, %%mm2 \n" 

2004 
"psubw %%mm3, %%mm1 \n" 

2005 
"psubw %%mm2, %%mm0 \n" 

2006 
"pmaddwd %%mm1, %%mm1 \n" 

2007 
"pmaddwd %%mm0, %%mm0 \n" 

2008 
"paddd %%mm1, %%mm4 \n" 

2009 
"paddd %%mm0, %%mm4 \n" 

2010 
"jg 1b \n" 

2011 
"movq %%mm4, %%mm3 \n" 

2012 
"psrlq $32, %%mm3 \n" 

2013 
"paddd %%mm3, %%mm4 \n" 

2014 
"movd %%mm4, %1 \n" 

2015 
:"+r"(i), "=r"(sum) 

2016 
:"r"(pix1), "r"(pix2) 

2017 
); 

2018 
return sum; 

2019 
} 

2020  
2021 
#endif //CONFIG_ENCODERS 

2022  
2023  773 
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ 
2024  774 
"paddw " #m4 ", " #m3 " \n\t" /* x1 */\ 
2025  775 
"movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ 
...  ...  
2858  1608 
} 
2859  1609 
} 
2860  1610  
2861 
#ifdef CONFIG_ENCODERS 

2862  
2863 
#define PHADDD(a, t)\ 

2864 
"movq "#a", "#t" \n\t"\ 

2865 
"psrlq $32, "#a" \n\t"\ 

2866 
"paddd "#t", "#a" \n\t" 

2867 
/* 

2868 
pmulhw: dst[015]=(src[015]*dst[015])[1631] 

2869 
pmulhrw: dst[015]=(src[015]*dst[015] + 0x8000)[1631] 

2870 
pmulhrsw: dst[015]=(src[015]*dst[015] + 0x4000)[1530] 

2871 
*/ 

2872 
#define PMULHRW(x, y, s, o)\ 

2873 
"pmulhw " #s ", "#x " \n\t"\ 

2874 
"pmulhw " #s ", "#y " \n\t"\ 

2875 
"paddw " #o ", "#x " \n\t"\ 

2876 
"paddw " #o ", "#y " \n\t"\ 

2877 
"psraw $1, "#x " \n\t"\ 

2878 
"psraw $1, "#y " \n\t" 

2879 
#define DEF(x) x ## _mmx 

2880 
#define SET_RND MOVQ_WONE 

2881 
#define SCALE_OFFSET 1 

2882  
2883 
#include "dsputil_mmx_qns.h" 

2884  
2885 
#undef DEF 

2886 
#undef SET_RND 

2887 
#undef SCALE_OFFSET 

2888 
#undef PMULHRW 

2889  
2890 
#define DEF(x) x ## _3dnow 

2891 
#define SET_RND(x) 

2892 
#define SCALE_OFFSET 0 

2893 
#define PMULHRW(x, y, s, o)\ 

2894 
"pmulhrw " #s ", "#x " \n\t"\ 

2895 
"pmulhrw " #s ", "#y " \n\t" 

2896  
2897 
#include "dsputil_mmx_qns.h" 

2898  
2899 
#undef DEF 

2900 
#undef SET_RND 

2901 
#undef SCALE_OFFSET 

2902 
#undef PMULHRW 

2903  
2904 
#ifdef HAVE_SSSE3 

2905 
#undef PHADDD 

2906 
#define DEF(x) x ## _ssse3 

2907 
#define SET_RND(x) 

2908 
#define SCALE_OFFSET 1 

2909 
#define PHADDD(a, t)\ 

2910 
"pshufw $0x0E, "#a", "#t" \n\t"\ 

2911 
"paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */ 

2912 
#define PMULHRW(x, y, s, o)\ 

2913 
"pmulhrsw " #s ", "#x " \n\t"\ 

2914 
"pmulhrsw " #s ", "#y " \n\t" 

2915  
2916 
#include "dsputil_mmx_qns.h" 

2917  
2918 
#undef DEF 

2919 
#undef SET_RND 

2920 
#undef SCALE_OFFSET 

2921 
#undef PMULHRW 

2922 
#undef PHADDD 

2923 
#endif //HAVE_SSSE3 

2924  
2925 
#endif /* CONFIG_ENCODERS */ 

2926  
2927  1611 
#define PREFETCH(name, op) \ 
2928  1612 
static void name(void *mem, int stride, int h){\ 
2929  1613 
const uint8_t *p= mem;\ 
...  ...  
2954  1638 
avg_pixels16_mmx(dst, src, stride, 16); 
2955  1639 
} 
2956  1640  
2957 
/* FLAC specific */ 

2958 
void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag, 

2959 
double *autoc); 

2960  
2961  1641 
/* VC1 specific */ 
2962  1642 
void ff_vc1dsp_init_mmx(DSPContext* dsp, AVCodecContext *avctx); 
2963  1643  
...  ...  
3320  2000 
if (mm_flags & MM_MMX) { 
3321  2001 
const int idct_algo= avctx>idct_algo; 
3322  2002  
3323 
#ifdef CONFIG_ENCODERS 

3324 
const int dct_algo = avctx>dct_algo; 

3325 
if(dct_algo==FF_DCT_AUTO  dct_algo==FF_DCT_MMX){ 

3326 
if(mm_flags & MM_SSE2){ 

3327 
c>fdct = ff_fdct_sse2; 

3328 
}else if(mm_flags & MM_MMXEXT){ 

3329 
c>fdct = ff_fdct_mmx2; 

3330 
}else{ 

3331 
c>fdct = ff_fdct_mmx; 

3332 
} 

3333 
} 

3334 
#endif //CONFIG_ENCODERS 

3335  2003 
if(avctx>lowres==0){ 
3336  2004 
if(idct_algo==FF_IDCT_AUTO  idct_algo==FF_IDCT_SIMPLEMMX){ 
3337  2005 
c>idct_put= ff_simple_idct_put_mmx; 
...  ...  
3382  2050 
} 
3383  2051 
} 
3384  2052  
3385 
#ifdef CONFIG_ENCODERS 

3386 
c>get_pixels = get_pixels_mmx; 

3387 
c>diff_pixels = diff_pixels_mmx; 

3388 
#endif //CONFIG_ENCODERS 

3389  2053 
c>put_pixels_clamped = put_pixels_clamped_mmx; 
3390  2054 
c>put_signed_pixels_clamped = put_signed_pixels_clamped_mmx; 
3391  2055 
c>add_pixels_clamped = add_pixels_clamped_mmx; 
3392  2056 
c>clear_blocks = clear_blocks_mmx; 
3393 
#ifdef CONFIG_ENCODERS 

3394 
c>pix_sum = pix_sum16_mmx; 

3395 
#endif //CONFIG_ENCODERS 

3396  2057  
3397  2058 
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ 
3398  2059 
c>PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ 
...  ...  
3413  2074  
3414  2075 
c>add_bytes= add_bytes_mmx; 
3415  2076 
c>add_bytes_l2= add_bytes_l2_mmx; 
3416 
#ifdef CONFIG_ENCODERS 

3417 
c>diff_bytes= diff_bytes_mmx; 

3418 
c>sum_abs_dctelem= sum_abs_dctelem_mmx; 

3419  
3420 
c>hadamard8_diff[0]= hadamard8_diff16_mmx; 

3421 
c>hadamard8_diff[1]= hadamard8_diff_mmx; 

3422  
3423 
c>pix_norm1 = pix_norm1_mmx; 

3424 
c>sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx; 

3425 
c>sse[1] = sse8_mmx; 

3426 
c>vsad[4]= vsad_intra16_mmx; 

3427  
3428 
c>nsse[0] = nsse16_mmx; 

3429 
c>nsse[1] = nsse8_mmx; 

3430 
if(!(avctx>flags & CODEC_FLAG_BITEXACT)){ 

3431 
c>vsad[0] = vsad16_mmx; 

3432 
} 

3433  
3434 
if(!(avctx>flags & CODEC_FLAG_BITEXACT)){ 

3435 
c>try_8x8basis= try_8x8basis_mmx; 

3436 
} 

3437 
c>add_8x8basis= add_8x8basis_mmx; 

3438  
3439 
c>ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; 

3440  
3441 
#endif //CONFIG_ENCODERS 

3442  2077  
3443  2078 
if (ENABLE_ANY_H263) { 
3444  2079 
c>h263_v_loop_filter= h263_v_loop_filter_mmx; 
...  ...  
3472  2107 
c>avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; 
3473  2108 
c>avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; 
3474  2109  
3475 
#ifdef CONFIG_ENCODERS 

3476 
c>sum_abs_dctelem= sum_abs_dctelem_mmx2; 

3477 
c>hadamard8_diff[0]= hadamard8_diff16_mmx2; 

3478 
c>hadamard8_diff[1]= hadamard8_diff_mmx2; 

3479 
c>vsad[4]= vsad_intra16_mmx2; 

3480 
#endif //CONFIG_ENCODERS 

3481  
3482  2110 
c>h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; 
3483  2111 
c>h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; 
3484  2112  
...  ...  
3489  2117 
c>put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; 
3490  2118 
c>avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; 
3491  2119 
c>avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; 
3492 
#ifdef CONFIG_ENCODERS 

3493 
c>vsad[0] = vsad16_mmx2; 

3494 
#endif //CONFIG_ENCODERS 

3495  2120 
} 
3496  2121  
3497  2122 
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \ 
...  ...  
3568  2193 
ff_vc1dsp_init_mmx(c, avctx); 
3569  2194  
3570  2195 
c>add_png_paeth_prediction= add_png_paeth_prediction_mmx2; 
3571 
#ifdef CONFIG_ENCODERS 

3572 
c>sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; 

3573 
#endif //CONFIG_ENCODERS 

3574  2196 
} else if (mm_flags & MM_3DNOW) { 
3575  2197 
c>prefetch = prefetch_3dnow; 
3576  2198  
...  ...  
3666  2288 
} 
3667  2289 
#endif 
3668  2290  
3669 
#ifdef CONFIG_ENCODERS 

3670 
if(mm_flags & MM_SSE2){ 

3671 
c>sum_abs_dctelem= sum_abs_dctelem_sse2; 

3672 
c>hadamard8_diff[0]= hadamard8_diff16_sse2; 

3673 
c>hadamard8_diff[1]= hadamard8_diff_sse2; 

3674 
if (ENABLE_FLAC_ENCODER) 

3675 
c>flac_compute_autocorr = ff_flac_compute_autocorr_sse2; 

3676 
} 

3677  
3678 
#ifdef HAVE_SSSE3 

3679 
if(mm_flags & MM_SSSE3){ 

3680 
if(!(avctx>flags & CODEC_FLAG_BITEXACT)){ 
Also available in: Unified diff