Revision be449fca libavcodec/i386/h264dsp_mmx.c
libavcodec/i386/h264dsp_mmx.c  

57  57 
static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) 
58  58 
{ 
59  59 
/* Load dct coeffs */ 
60 
asm volatile(


60 
__asm__ volatile(


61  61 
"movq (%0), %%mm0 \n\t" 
62  62 
"movq 8(%0), %%mm1 \n\t" 
63  63 
"movq 16(%0), %%mm2 \n\t" 
64  64 
"movq 24(%0), %%mm3 \n\t" 
65  65 
:: "r"(block) ); 
66  66  
67 
asm volatile(


67 
__asm__ volatile(


68  68 
/* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ 
69  69 
IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) 
70  70  
...  ...  
80  80 
"pxor %%mm7, %%mm7 \n\t" 
81  81 
:: "m"(ff_pw_32)); 
82  82  
83 
asm volatile(


83 
__asm__ volatile(


84  84 
STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) 
85  85 
"add %1, %0 \n\t" 
86  86 
STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) 
...  ...  
95  95  
96  96 
static inline void h264_idct8_1d(int16_t *block) 
97  97 
{ 
98 
asm volatile(


98 
__asm__ volatile(


99  99 
"movq 112(%0), %%mm7 \n\t" 
100  100 
"movq 80(%0), %%mm0 \n\t" 
101  101 
"movq 48(%0), %%mm3 \n\t" 
...  ...  
166  166  
167  167 
h264_idct8_1d(block+4*i); 
168  168  
169 
asm volatile(


169 
__asm__ volatile(


170  170 
"movq %%mm7, %0 \n\t" 
171  171 
TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) 
172  172 
"movq %%mm0, 8(%1) \n\t" 
...  ...  
188  188 
for(i=0; i<2; i++){ 
189  189 
h264_idct8_1d(b2+4*i); 
190  190  
191 
asm volatile(


191 
__asm__ volatile(


192  192 
"psraw $6, %%mm7 \n\t" 
193  193 
"psraw $6, %%mm6 \n\t" 
194  194 
"psraw $6, %%mm5 \n\t" 
...  ...  
269  269  
270  270 
static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) 
271  271 
{ 
272 
asm volatile(


272 
__asm__ volatile(


273  273 
"movdqa 0x10(%1), %%xmm1 \n" 
274  274 
"movdqa 0x20(%1), %%xmm2 \n" 
275  275 
"movdqa 0x30(%1), %%xmm3 \n" 
...  ...  
304  304 
static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) 
305  305 
{ 
306  306 
int dc = (block[0] + 32) >> 6; 
307 
asm volatile(


307 
__asm__ volatile(


308  308 
"movd %0, %%mm0 \n\t" 
309  309 
"pshufw $0, %%mm0, %%mm0 \n\t" 
310  310 
"pxor %%mm1, %%mm1 \n\t" 
...  ...  
313  313 
"packuswb %%mm1, %%mm1 \n\t" 
314  314 
::"r"(dc) 
315  315 
); 
316 
asm volatile(


316 
__asm__ volatile(


317  317 
"movd %0, %%mm2 \n\t" 
318  318 
"movd %1, %%mm3 \n\t" 
319  319 
"movd %2, %%mm4 \n\t" 
...  ...  
341  341 
{ 
342  342 
int dc = (block[0] + 32) >> 6; 
343  343 
int y; 
344 
asm volatile(


344 
__asm__ volatile(


345  345 
"movd %0, %%mm0 \n\t" 
346  346 
"pshufw $0, %%mm0, %%mm0 \n\t" 
347  347 
"pxor %%mm1, %%mm1 \n\t" 
...  ...  
351  351 
::"r"(dc) 
352  352 
); 
353  353 
for(y=2; y--; dst += 4*stride){ 
354 
asm volatile(


354 
__asm__ volatile(


355  355 
"movq %0, %%mm2 \n\t" 
356  356 
"movq %1, %%mm3 \n\t" 
357  357 
"movq %2, %%mm4 \n\t" 
...  ...  
463  463 
{ 
464  464 
DECLARE_ALIGNED_8(uint64_t, tmp0[2]); 
465  465  
466 
asm volatile(


466 
__asm__ volatile(


467  467 
"movq (%1,%3), %%mm0 \n\t" //p1 
468  468 
"movq (%1,%3,2), %%mm1 \n\t" //p0 
469  469 
"movq (%2), %%mm2 \n\t" //q0 
...  ...  
540  540  
541  541 
static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) 
542  542 
{ 
543 
asm volatile(


543 
__asm__ volatile(


544  544 
"movq (%0), %%mm0 \n\t" //p1 
545  545 
"movq (%0,%2), %%mm1 \n\t" //p0 
546  546 
"movq (%1), %%mm2 \n\t" //q0 
...  ...  
586  586  
587  587 
static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) 
588  588 
{ 
589 
asm volatile(


589 
__asm__ volatile(


590  590 
"movq (%0), %%mm0 \n\t" 
591  591 
"movq (%0,%2), %%mm1 \n\t" 
592  592 
"movq (%1), %%mm2 \n\t" 
...  ...  
628  628 
static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], 
629  629 
int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { 
630  630 
int dir; 
631 
asm volatile(


631 
__asm__ volatile(


632  632 
"pxor %%mm7, %%mm7 \n\t" 
633  633 
"movq %0, %%mm6 \n\t" 
634  634 
"movq %1, %%mm5 \n\t" 
...  ...  
636  636 
::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7) 
637  637 
); 
638  638 
if(field) 
639 
asm volatile(


639 
__asm__ volatile(


640  640 
"movq %0, %%mm5 \n\t" 
641  641 
"movq %1, %%mm4 \n\t" 
642  642 
::"m"(ff_pb_3_1), "m"(ff_pb_7_3) 
...  ...  
650  650 
DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; 
651  651 
int b_idx, edge, l; 
652  652 
for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { 
653 
asm volatile(


653 
__asm__ volatile(


654  654 
"pand %0, %%mm0 \n\t" 
655  655 
::"m"(mask_dir) 
656  656 
); 
657  657 
if(!(mask_mv & edge)) { 
658 
asm volatile("pxor %%mm0, %%mm0 \n\t":);


658 
__asm__ volatile("pxor %%mm0, %%mm0 \n\t":);


659  659 
for( l = bidir; l >= 0; l-- ) { 
660 
asm volatile(


660 
__asm__ volatile(


661  661 
"movd %0, %%mm1 \n\t" 
662  662 
"punpckldq %1, %%mm1 \n\t" 
663  663 
"movq %%mm1, %%mm2 \n\t" 
...  ...  
688  688 
); 
689  689 
} 
690  690 
} 
691 
asm volatile(


691 
__asm__ volatile(


692  692 
"movd %0, %%mm1 \n\t" 
693  693 
"por %1, %%mm1 \n\t" 
694  694 
"punpcklbw %%mm7, %%mm1 \n\t" 
...  ...  
696  696 
::"m"(nnz[b_idx]), 
697  697 
"m"(nnz[b_idx+d_idx]) 
698  698 
); 
699 
asm volatile(


699 
__asm__ volatile(


700  700 
"pcmpeqw %%mm7, %%mm0 \n\t" 
701  701 
"pcmpeqw %%mm7, %%mm0 \n\t" 
702  702 
"psrlw $15, %%mm0 \n\t" // nonzero -> 1 
...  ...  
713  713 
edges = 4; 
714  714 
step = 1; 
715  715 
} 
716 
asm volatile(


716 
__asm__ volatile(


717  717 
"movq (%0), %%mm0 \n\t" 
718  718 
"movq 8(%0), %%mm1 \n\t" 
719  719 
"movq 16(%0), %%mm2 \n\t" 
...  ...  
774  774 
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 
775  775 
int h=4;\ 
776  776 
\ 
777 
asm volatile(\


777 
__asm__ volatile(\


778  778 
"pxor %%mm7, %%mm7 \n\t"\ 
779  779 
"movq %5, %%mm4 \n\t"\ 
780  780 
"movq %6, %%mm5 \n\t"\ 
...  ...  
813  813 
}\ 
814  814 
static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 
815  815 
int h=4;\ 
816 
asm volatile(\


816 
__asm__ volatile(\


817  817 
"pxor %%mm7, %%mm7 \n\t"\ 
818  818 
"movq %0, %%mm4 \n\t"\ 
819  819 
"movq %1, %%mm5 \n\t"\ 
820  820 
:: "m"(ff_pw_5), "m"(ff_pw_16)\ 
821  821 
);\ 
822  822 
do{\ 
823 
asm volatile(\


823 
__asm__ volatile(\


824  824 
"movd 1(%0), %%mm1 \n\t"\ 
825  825 
"movd (%0), %%mm2 \n\t"\ 
826  826 
"movd 1(%0), %%mm3 \n\t"\ 
...  ...  
857  857 
}\ 
858  858 
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 
859  859 
src -= 2*srcStride;\ 
860 
asm volatile(\


860 
__asm__ volatile(\


861  861 
"pxor %%mm7, %%mm7 \n\t"\ 
862  862 
"movd (%0), %%mm0 \n\t"\ 
863  863 
"add %2, %0 \n\t"\ 
...  ...  
889  889 
int w=3;\ 
890  890 
src -= 2*srcStride+2;\ 
891  891 
while(w){\ 
892 
asm volatile(\


892 
__asm__ volatile(\


893  893 
"pxor %%mm7, %%mm7 \n\t"\ 
894  894 
"movd (%0), %%mm0 \n\t"\ 
895  895 
"add %2, %0 \n\t"\ 
...  ...  
919  919 
src += 4-9*srcStride;\ 
920  920 
}\ 
921  921 
tmp = 3*4;\ 
922 
asm volatile(\


922 
__asm__ volatile(\


923  923 
"1: \n\t"\ 
924  924 
"movq (%0), %%mm0 \n\t"\ 
925  925 
"paddw 10(%0), %%mm0 \n\t"\ 
...  ...  
948  948 
\ 
949  949 
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 
950  950 
int h=8;\ 
951 
asm volatile(\


951 
__asm__ volatile(\


952  952 
"pxor %%mm7, %%mm7 \n\t"\ 
953  953 
"movq %5, %%mm6 \n\t"\ 
954  954 
"1: \n\t"\ 
...  ...  
1005  1005 
\ 
1006  1006 
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 
1007  1007 
int h=8;\ 
1008 
asm volatile(\


1008 
__asm__ volatile(\


1009  1009 
"pxor %%mm7, %%mm7 \n\t"\ 
1010  1010 
"movq %0, %%mm6 \n\t"\ 
1011  1011 
:: "m"(ff_pw_5)\ 
1012  1012 
);\ 
1013  1013 
do{\ 
1014 
asm volatile(\


1014 
__asm__ volatile(\


1015  1015 
"movq (%0), %%mm0 \n\t"\ 
1016  1016 
"movq 1(%0), %%mm2 \n\t"\ 
1017  1017 
"movq %%mm0, %%mm1 \n\t"\ 
...  ...  
1071  1071 
src -= 2*srcStride;\ 
1072  1072 
\ 
1073  1073 
while(w){\ 
1074 
asm volatile(\


1074 
__asm__ volatile(\


1075  1075 
"pxor %%mm7, %%mm7 \n\t"\ 
1076  1076 
"movd (%0), %%mm0 \n\t"\ 
1077  1077 
"add %2, %0 \n\t"\ 
...  ...  
1102  1102 
: "memory"\ 
1103  1103 
);\ 
1104  1104 
if(h==16){\ 
1105 
asm volatile(\


1105 
__asm__ volatile(\


1106  1106 
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 
1107  1107 
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 
1108  1108 
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ 
...  ...  
1125  1125 
int w = (size+8)>>2;\ 
1126  1126 
src -= 2*srcStride+2;\ 
1127  1127 
while(w){\ 
1128 
asm volatile(\


1128 
__asm__ volatile(\


1129  1129 
"pxor %%mm7, %%mm7 \n\t"\ 
1130  1130 
"movd (%0), %%mm0 \n\t"\ 
1131  1131 
"add %2, %0 \n\t"\ 
...  ...  
1155  1155 
: "memory"\ 
1156  1156 
);\ 
1157  1157 
if(size==16){\ 
1158 
asm volatile(\


1158 
__asm__ volatile(\


1159  1159 
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ 
1160  1160 
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ 
1161  1161 
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ 
...  ...  
1177  1177 
int w = size>>4;\ 
1178  1178 
do{\ 
1179  1179 
int h = size;\ 
1180 
asm volatile(\


1180 
__asm__ volatile(\


1181  1181 
"1: \n\t"\ 
1182  1182 
"movq (%0), %%mm0 \n\t"\ 
1183  1183 
"movq 8(%0), %%mm3 \n\t"\ 
...  ...  
1261  1261 
\ 
1262  1262 
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 
1263  1263 
{\ 
1264 
asm volatile(\


1264 
__asm__ volatile(\


1265  1265 
"movq (%1), %%mm0 \n\t"\ 
1266  1266 
"movq 24(%1), %%mm1 \n\t"\ 
1267  1267 
"psraw $5, %%mm0 \n\t"\ 
...  ...  
1291  1291 
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 
1292  1292 
{\ 
1293  1293 
do{\ 
1294 
asm volatile(\


1294 
__asm__ volatile(\


1295  1295 
"movq (%1), %%mm0 \n\t"\ 
1296  1296 
"movq 8(%1), %%mm1 \n\t"\ 
1297  1297 
"movq 48(%1), %%mm2 \n\t"\ 
...  ...  
1325  1325 
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 
1326  1326 
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 
1327  1327 
int h=16;\ 
1328 
asm volatile(\


1328 
__asm__ volatile(\


1329  1329 
"pxor %%xmm15, %%xmm15 \n\t"\ 
1330  1330 
"movdqa %6, %%xmm14 \n\t"\ 
1331  1331 
"movdqa %7, %%xmm13 \n\t"\ 
...  ...  
1403  1403 
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ 
1404  1404 
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 
1405  1405 
int h=8;\ 
1406 
asm volatile(\


1406 
__asm__ volatile(\


1407  1407 
"pxor %%xmm7, %%xmm7 \n\t"\ 
1408  1408 
"movdqa %0, %%xmm6 \n\t"\ 
1409  1409 
:: "m"(ff_pw_5)\ 
1410  1410 
);\ 
1411  1411 
do{\ 
1412 
asm volatile(\


1412 
__asm__ volatile(\


1413  1413 
"lddqu 5(%0), %%xmm1 \n\t"\ 
1414  1414 
"movdqa %%xmm1, %%xmm0 \n\t"\ 
1415  1415 
"punpckhbw %%xmm7, %%xmm1 \n\t"\ 
...  ...  
1450  1450 
\ 
1451  1451 
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 
1452  1452 
int h=8;\ 
1453 
asm volatile(\


1453 
__asm__ volatile(\


1454  1454 
"pxor %%xmm7, %%xmm7 \n\t"\ 
1455  1455 
"movdqa %5, %%xmm6 \n\t"\ 
1456  1456 
"1: \n\t"\ 
...  ...  
1501  1501 
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 
1502  1502 
src -= 2*srcStride;\ 
1503  1503 
\ 
1504 
asm volatile(\


1504 
__asm__ volatile(\


1505  1505 
"pxor %%xmm7, %%xmm7 \n\t"\ 
1506  1506 
"movq (%0), %%xmm0 \n\t"\ 
1507  1507 
"add %2, %0 \n\t"\ 
...  ...  
1532  1532 
: "memory"\ 
1533  1533 
);\ 
1534  1534 
if(h==16){\ 
1535 
asm volatile(\


1535 
__asm__ volatile(\


1536  1536 
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ 
1537  1537 
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ 
1538  1538 
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ 
...  ...  
1560  1560 
int w = (size+8)>>3; 
1561  1561 
src -= 2*srcStride+2; 
1562  1562 
while(w){ 
1563 
asm volatile(


1563 
__asm__ volatile(


1564  1564 
"pxor %%xmm7, %%xmm7 \n\t" 
1565  1565 
"movq (%0), %%xmm0 \n\t" 
1566  1566 
"add %2, %0 \n\t" 
...  ...  
1590  1590 
: "memory" 
1591  1591 
); 
1592  1592 
if(size==16){ 
1593 
asm volatile(


1593 
__asm__ volatile(


1594  1594 
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) 
1595  1595 
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) 
1596  1596 
QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) 
...  ...  
1613  1613 
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ 
1614  1614 
int h = size;\ 
1615  1615 
if(size == 16){\ 
1616 
asm volatile(\


1616 
__asm__ volatile(\


1617  1617 
"1: \n\t"\ 
1618  1618 
"movdqa 32(%0), %%xmm4 \n\t"\ 
1619  1619 
"movdqa 16(%0), %%xmm5 \n\t"\ 
...  ...  
1668  1668 
: "memory"\ 
1669  1669 
);\ 
1670  1670 
}else{\ 
1671 
asm volatile(\


1671 
__asm__ volatile(\


1672  1672 
"1: \n\t"\ 
1673  1673 
"movdqa 16(%0), %%xmm1 \n\t"\ 
1674  1674 
"movdqa (%0), %%xmm0 \n\t"\ 
...  ...  
2022  2022 
int x, y; 
2023  2023 
offset <<= log2_denom; 
2024  2024 
offset += (1 << log2_denom) >> 1; 
2025 
asm volatile(


2025 
__asm__ volatile(


2026  2026 
"movd %0, %%mm4 \n\t" 
2027  2027 
"movd %1, %%mm5 \n\t" 
2028  2028 
"movd %2, %%mm6 \n\t" 
...  ...  
2033  2033 
); 
2034  2034 
for(y=0; y<h; y+=2){ 
2035  2035 
for(x=0; x<w; x+=4){ 
2036 
asm volatile(


2036 
__asm__ volatile(


2037  2037 
"movd %0, %%mm0 \n\t" 
2038  2038 
"movd %1, %%mm1 \n\t" 
2039  2039 
"punpcklbw %%mm7, %%mm0 \n\t" 
...  ...  
2060  2060 
{ 
2061  2061 
int x, y; 
2062  2062 
offset = ((offset + 1) | 1) << log2_denom; 
2063 
asm volatile(


2063 
__asm__ volatile(


2064  2064 
"movd %0, %%mm3 \n\t" 
2065  2065 
"movd %1, %%mm4 \n\t" 
2066  2066 
"movd %2, %%mm5 \n\t" 
...  ...  
2073  2073 
); 
2074  2074 
for(y=0; y<h; y++){ 
2075  2075 
for(x=0; x<w; x+=4){ 
2076 
asm volatile(


2076 
__asm__ volatile(


2077  2077 
"movd %0, %%mm0 \n\t" 
2078  2078 
"movd %1, %%mm1 \n\t" 
2079  2079 
"punpcklbw %%mm7, %%mm0 \n\t" 
Also available in: Unified diff