Revision b4c2ada5 libavcodec/x86/h264dsp_mmx.c
libavcodec/x86/h264dsp_mmx.c | ||
---|---|---|
812 | 812 |
// could do a special case for dir==0 && edges==1, but it only reduces the |
813 | 813 |
// average filter time by 1.2% |
814 | 814 |
for( dir=1; dir>=0; dir-- ) { |
815 |
const int d_idx = dir ? -8 : -1;
|
|
815 |
const x86_reg d_idx = dir ? -8 : -1;
|
|
816 | 816 |
const int mask_mv = dir ? mask_mv1 : mask_mv0; |
817 | 817 |
DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; |
818 | 818 |
int b_idx, edge, l; |
... | ... | |
825 | 825 |
__asm__ volatile("pxor %%mm0, %%mm0 \n\t":); |
826 | 826 |
for( l = bidir; l >= 0; l-- ) { |
827 | 827 |
__asm__ volatile( |
828 |
"movd %0, %%mm1 \n\t"
|
|
829 |
"punpckldq %1, %%mm1 \n\t"
|
|
828 |
"movd (%0), %%mm1 \n\t"
|
|
829 |
"punpckldq (%0,%1), %%mm1 \n\t"
|
|
830 | 830 |
"punpckldq %%mm1, %%mm2 \n\t" |
831 | 831 |
"pcmpeqb %%mm2, %%mm1 \n\t" |
832 | 832 |
"paddb %%mm6, %%mm1 \n\t" |
833 | 833 |
"punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] |
834 | 834 |
"por %%mm1, %%mm0 \n\t" |
835 | 835 |
|
836 |
"movq %2, %%mm1 \n\t"
|
|
837 |
"movq 8+1*%2, %%mm2 \n\t"
|
|
838 |
"psubw %3, %%mm1 \n\t"
|
|
839 |
"psubw 8+1*%3, %%mm2 \n\t"
|
|
836 |
"movq (%2), %%mm1 \n\t"
|
|
837 |
"movq 8(%2), %%mm2 \n\t"
|
|
838 |
"psubw (%2,%1,4), %%mm1 \n\t"
|
|
839 |
"psubw 8(%2,%1,4), %%mm2 \n\t"
|
|
840 | 840 |
"packsswb %%mm2, %%mm1 \n\t" |
841 | 841 |
"paddb %%mm5, %%mm1 \n\t" |
842 | 842 |
"pminub %%mm4, %%mm1 \n\t" |
843 | 843 |
"pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit |
844 | 844 |
"por %%mm1, %%mm0 \n\t" |
845 |
::"m"(ref[l][b_idx]), |
|
846 |
"m"(ref[l][b_idx+d_idx]), |
|
847 |
"m"(mv[l][b_idx][0]), |
|
848 |
"m"(mv[l][b_idx+d_idx][0]) |
|
845 |
::"r"(ref[l]+b_idx), |
|
846 |
"r"(d_idx), |
|
847 |
"r"(mv[l]+b_idx) |
|
849 | 848 |
); |
850 | 849 |
} |
851 | 850 |
if(bidir==1){ |
852 | 851 |
__asm__ volatile("pxor %%mm3, %%mm3 \n\t":); |
853 | 852 |
for( l = bidir; l >= 0; l-- ) { |
854 | 853 |
__asm__ volatile( |
855 |
"movd %0, %%mm1 \n\t"
|
|
856 |
"punpckldq %1, %%mm1 \n\t"
|
|
854 |
"movd (%0), %%mm1 \n\t"
|
|
855 |
"punpckldq (%1), %%mm1 \n\t"
|
|
857 | 856 |
"punpckldq %%mm1, %%mm2 \n\t" |
858 | 857 |
"pcmpeqb %%mm2, %%mm1 \n\t" |
859 | 858 |
"paddb %%mm6, %%mm1 \n\t" |
860 | 859 |
"punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] |
861 | 860 |
"por %%mm1, %%mm3 \n\t" |
862 | 861 |
|
863 |
"movq %2, %%mm1 \n\t"
|
|
864 |
"movq 8+1*%2, %%mm2 \n\t"
|
|
865 |
"psubw %3, %%mm1 \n\t"
|
|
866 |
"psubw 8+1*%3, %%mm2 \n\t"
|
|
862 |
"movq (%2), %%mm1 \n\t"
|
|
863 |
"movq 8(%2), %%mm2 \n\t"
|
|
864 |
"psubw (%3), %%mm1 \n\t"
|
|
865 |
"psubw 8(%3), %%mm2 \n\t"
|
|
867 | 866 |
"packsswb %%mm2, %%mm1 \n\t" |
868 | 867 |
"paddb %%mm5, %%mm1 \n\t" |
869 | 868 |
"pminub %%mm4, %%mm1 \n\t" |
870 | 869 |
"pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit |
871 | 870 |
"por %%mm1, %%mm3 \n\t" |
872 |
::"m"(ref[l][b_idx]),
|
|
873 |
"m"(ref[1-l][b_idx+d_idx]),
|
|
874 |
"m"(mv[l][b_idx][0]),
|
|
875 |
"m"(mv[1-l][b_idx+d_idx][0])
|
|
871 |
::"r"(ref[l]+b_idx),
|
|
872 |
"r"(ref[1-l]+b_idx+d_idx),
|
|
873 |
"r"(mv[l][b_idx]),
|
|
874 |
"r"(mv[1-l][b_idx+d_idx])
|
|
876 | 875 |
); |
877 | 876 |
} |
878 | 877 |
__asm__ volatile( |
Also available in: Unified diff