Revision 900479bb
libavcodec/x86/h264dsp_mmx.c  

796  796 
int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { 
797  797 
int dir; 
798  798 
__asm__ volatile( 
799 
"pxor %%mm7, %%mm7 \n\t" 

800 
"movq %0, %%mm6 \n\t" 

801 
"movq %1, %%mm5 \n\t" 

802 
"movq %2, %%mm4 \n\t" 

803 
::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7) 

799 
"movq %0, %%mm7 \n" 

800 
"movq %1, %%mm6 \n" 

801 
::"m"(ff_pb_1), "m"(ff_pb_3) 

804  802 
); 
805  803 
if(field) 
806  804 
__asm__ volatile( 
807 
"movq %0, %%mm5 \n\t" 

808 
"movq %1, %%mm4 \n\t" 

809 
::"m"(ff_pb_3_1), "m"(ff_pb_7_3) 

805 
"movq %0, %%mm6 \n" 

806 
::"m"(ff_pb_3_1) 

810  807 
); 
808 
__asm__ volatile( 

809 
"movq %%mm6, %%mm5 \n" 

810 
"paddb %%mm5, %%mm5 \n" 

811 
:); 

811  812  
812  813 
// could do a special case for dir==0 && edges==1, but it only reduces the 
813  814 
// average filter time by 1.2% 
...  ...  
815  816 
const x86_reg d_idx = dir ? 8 : 1; 
816  817 
const int mask_mv = dir ? mask_mv1 : mask_mv0; 
817  818 
DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; 
818 
int b_idx, edge, l;


819 
int b_idx, edge; 

819  820 
for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { 
820  821 
__asm__ volatile( 
821  822 
"pand %0, %%mm0 \n\t" 
822  823 
::"m"(mask_dir) 
823  824 
); 
824  825 
if(!(mask_mv & edge)) { 
825 
__asm__ volatile("pxor %%mm0, %%mm0 \n\t":); 

826 
for( l = bidir; l >= 0; l ) { 

826 
if(bidir) { 

827  827 
__asm__ volatile( 
828 
"movd (%0), %%mm1 \n\t" 

829 
"punpckldq (%0,%1), %%mm1 \n\t" 

830 
"punpckldq %%mm1, %%mm2 \n\t" 

831 
"pcmpeqb %%mm2, %%mm1 \n\t" 

832 
"paddb %%mm6, %%mm1 \n\t" 

833 
"punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] 

834 
"por %%mm1, %%mm0 \n\t" 

835  
836 
"movq (%2), %%mm1 \n\t" 

837 
"movq 8(%2), %%mm2 \n\t" 

838 
"psubw (%2,%1,4), %%mm1 \n\t" 

839 
"psubw 8(%2,%1,4), %%mm2 \n\t" 

840 
"packsswb %%mm2, %%mm1 \n\t" 

841 
"paddb %%mm5, %%mm1 \n\t" 

842 
"pminub %%mm4, %%mm1 \n\t" 

843 
"pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit 

844 
"por %%mm1, %%mm0 \n\t" 

845 
::"r"(ref[l]+b_idx), 

846 
"r"(d_idx), 

847 
"r"(mv[l]+b_idx) 

828 
"movd (%1,%0), %%mm2 \n" 

829 
"punpckldq 40(%1,%0), %%mm2 \n" // { ref0[bn], ref1[bn] } 

830 
"pshufw $0x44, (%1), %%mm0 \n" // { ref0[b], ref0[b] } 

831 
"pshufw $0x44, 40(%1), %%mm1 \n" // { ref1[b], ref1[b] } 

832 
"pshufw $0x4E, %%mm2, %%mm3 \n" 

833 
"psubb %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } 

834 
"psubb %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } 

835 
"1: \n" 

836 
"por %%mm1, %%mm0 \n" 

837 
"movq (%2,%0,4), %%mm1 \n" 

838 
"movq 8(%2,%0,4), %%mm2 \n" 

839 
"movq %%mm1, %%mm3 \n" 

840 
"movq %%mm2, %%mm4 \n" 

841 
"psubw (%2), %%mm1 \n" 

842 
"psubw 8(%2), %%mm2 \n" 

843 
"psubw 160(%2), %%mm3 \n" 

844 
"psubw 168(%2), %%mm4 \n" 

845 
"packsswb %%mm2, %%mm1 \n" 

846 
"packsswb %%mm4, %%mm3 \n" 

847 
"paddb %%mm6, %%mm1 \n" 

848 
"paddb %%mm6, %%mm3 \n" 

849 
"psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit 

850 
"psubusb %%mm5, %%mm3 \n" 

851 
"packsswb %%mm3, %%mm1 \n" 

852 
"add $40, %0 \n" 

853 
"cmp $40, %0 \n" 

854 
"jl 1b \n" 

855 
"sub $80, %0 \n" 

856 
"pshufw $0x4E, %%mm1, %%mm1 \n" 

857 
"por %%mm1, %%mm0 \n" 

858 
"pshufw $0x4E, %%mm0, %%mm1 \n" 

859 
"pminub %%mm1, %%mm0 \n" 

860 
::"r"(d_idx), 

861 
"r"(ref[0]+b_idx), 

862 
"r"(mv[0]+b_idx) 

848  863 
); 
849 
} 

850 
if(bidir==1){ 

851 
__asm__ volatile("pxor %%mm3, %%mm3 \n\t":); 

852 
for( l = bidir; l >= 0; l ) { 

864 
} else { 

853  865 
__asm__ volatile( 
854 
"movd (%0), %%mm1 \n\t" 

855 
"punpckldq (%1), %%mm1 \n\t" 

856 
"punpckldq %%mm1, %%mm2 \n\t" 

857 
"pcmpeqb %%mm2, %%mm1 \n\t" 

858 
"paddb %%mm6, %%mm1 \n\t" 

859 
"punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] 

860 
"por %%mm1, %%mm3 \n\t" 

861  
862 
"movq (%2), %%mm1 \n\t" 

863 
"movq 8(%2), %%mm2 \n\t" 

864 
"psubw (%3), %%mm1 \n\t" 

865 
"psubw 8(%3), %%mm2 \n\t" 

866 
"packsswb %%mm2, %%mm1 \n\t" 

867 
"paddb %%mm5, %%mm1 \n\t" 

868 
"pminub %%mm4, %%mm1 \n\t" 

869 
"pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit 

870 
"por %%mm1, %%mm3 \n\t" 

871 
::"r"(ref[l]+b_idx), 

872 
"r"(ref[1l]+b_idx+d_idx), 

873 
"r"(mv[l][b_idx]), 

874 
"r"(mv[1l][b_idx+d_idx]) 

866 
"movd (%1), %%mm0 \n" 

867 
"psubb (%1,%0), %%mm0 \n" // ref[b] != ref[bn] 

868 
"movq (%2), %%mm1 \n" 

869 
"movq 8(%2), %%mm2 \n" 

870 
"psubw (%2,%0,4), %%mm1 \n" 

871 
"psubw 8(%2,%0,4), %%mm2 \n" 

872 
"packsswb %%mm2, %%mm1 \n" 

873 
"paddb %%mm6, %%mm1 \n" 

874 
"psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit 

875 
"packsswb %%mm1, %%mm1 \n" 

876 
"por %%mm1, %%mm0 \n" 

877 
::"r"(d_idx), 

878 
"r"(ref[0]+b_idx), 

879 
"r"(mv[0]+b_idx) 

875  880 
); 
876 
} 

877 
__asm__ volatile( 

878 
"pcmpeqw %%mm7, %%mm3 \n\t" 

879 
"psubusw %%mm3, %%mm0 \n\t" 

880 
:); 

881  881 
} 
882  882 
} 
883  883 
__asm__ volatile( 
884 
"movd %0, %%mm1 \n\t" 

885 
"por %1, %%mm1 \n\t" 

886 
"punpcklbw %%mm7, %%mm1 \n\t" 

887 
"pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn] 

884 
"movd %0, %%mm1 \n" 

885 
"por %1, %%mm1 \n" // nnz[b] || nnz[bn] 

888  886 
::"m"(nnz[b_idx]), 
889  887 
"m"(nnz[b_idx+d_idx]) 
890  888 
); 
891  889 
__asm__ volatile( 
892 
"pcmpeqw %%mm7, %%mm0 \n\t" 

893 
"pcmpeqw %%mm7, %%mm0 \n\t" 

894 
"psrlw $15, %%mm0 \n\t" // nonzero -> 1 

895 
"psrlw $14, %%mm1 \n\t" 

896 
"movq %%mm0, %%mm2 \n\t" 

897 
"por %%mm1, %%mm2 \n\t" 

898 
"psrlw $1, %%mm1 \n\t" 

899 
"pandn %%mm2, %%mm1 \n\t" 

900 
"movq %%mm1, %0 \n\t" 

890 
"pminub %%mm7, %%mm1 \n" 

891 
"pminub %%mm7, %%mm0 \n" 

892 
"psllw $1, %%mm1 \n" 

893 
"pxor %%mm2, %%mm2 \n" 

894 
"pmaxub %%mm0, %%mm1 \n" 

895 
"punpcklbw %%mm2, %%mm1 \n" 

896 
"movq %%mm1, %0 \n" 

901  897 
:"=m"(*bS[dir][edge]) 
902  898 
::"memory" 
903  899 
); 
Also available in: Unified diff