Revision b4c2ada5

View differences:

libavcodec/x86/h264dsp_mmx.c
812 812
    // could do a special case for dir==0 && edges==1, but it only reduces the
813 813
    // average filter time by 1.2%
814 814
    for( dir=1; dir>=0; dir-- ) {
815
        const int d_idx = dir ? -8 : -1;
815
        const x86_reg d_idx = dir ? -8 : -1;
816 816
        const int mask_mv = dir ? mask_mv1 : mask_mv0;
817 817
        DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
818 818
        int b_idx, edge, l;
......
825 825
                __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
826 826
                for( l = bidir; l >= 0; l-- ) {
827 827
                    __asm__ volatile(
828
                        "movd %0, %%mm1 \n\t"
829
                        "punpckldq %1, %%mm1 \n\t"
828
                        "movd (%0), %%mm1 \n\t"
829
                        "punpckldq (%0,%1), %%mm1 \n\t"
830 830
                        "punpckldq %%mm1, %%mm2 \n\t"
831 831
                        "pcmpeqb %%mm2, %%mm1 \n\t"
832 832
                        "paddb %%mm6, %%mm1 \n\t"
833 833
                        "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
834 834
                        "por %%mm1, %%mm0 \n\t"
835 835

  
836
                        "movq %2, %%mm1 \n\t"
837
                        "movq 8+1*%2, %%mm2 \n\t"
838
                        "psubw %3, %%mm1 \n\t"
839
                        "psubw 8+1*%3, %%mm2 \n\t"
836
                        "movq (%2), %%mm1 \n\t"
837
                        "movq 8(%2), %%mm2 \n\t"
838
                        "psubw (%2,%1,4), %%mm1 \n\t"
839
                        "psubw 8(%2,%1,4), %%mm2 \n\t"
840 840
                        "packsswb %%mm2, %%mm1 \n\t"
841 841
                        "paddb %%mm5, %%mm1 \n\t"
842 842
                        "pminub %%mm4, %%mm1 \n\t"
843 843
                        "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
844 844
                        "por %%mm1, %%mm0 \n\t"
845
                        ::"m"(ref[l][b_idx]),
846
                          "m"(ref[l][b_idx+d_idx]),
847
                          "m"(mv[l][b_idx][0]),
848
                          "m"(mv[l][b_idx+d_idx][0])
845
                        ::"r"(ref[l]+b_idx),
846
                          "r"(d_idx),
847
                          "r"(mv[l]+b_idx)
849 848
                    );
850 849
                }
851 850
                if(bidir==1){
852 851
                    __asm__ volatile("pxor %%mm3, %%mm3 \n\t":);
853 852
                    for( l = bidir; l >= 0; l-- ) {
854 853
                    __asm__ volatile(
855
                        "movd %0, %%mm1 \n\t"
856
                        "punpckldq %1, %%mm1 \n\t"
854
                        "movd (%0), %%mm1 \n\t"
855
                        "punpckldq (%1), %%mm1 \n\t"
857 856
                        "punpckldq %%mm1, %%mm2 \n\t"
858 857
                        "pcmpeqb %%mm2, %%mm1 \n\t"
859 858
                        "paddb %%mm6, %%mm1 \n\t"
860 859
                        "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn]
861 860
                        "por %%mm1, %%mm3 \n\t"
862 861

  
863
                        "movq %2, %%mm1 \n\t"
864
                        "movq 8+1*%2, %%mm2 \n\t"
865
                        "psubw %3, %%mm1 \n\t"
866
                        "psubw 8+1*%3, %%mm2 \n\t"
862
                        "movq (%2), %%mm1 \n\t"
863
                        "movq 8(%2), %%mm2 \n\t"
864
                        "psubw (%3), %%mm1 \n\t"
865
                        "psubw 8(%3), %%mm2 \n\t"
867 866
                        "packsswb %%mm2, %%mm1 \n\t"
868 867
                        "paddb %%mm5, %%mm1 \n\t"
869 868
                        "pminub %%mm4, %%mm1 \n\t"
870 869
                        "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit
871 870
                        "por %%mm1, %%mm3 \n\t"
872
                        ::"m"(ref[l][b_idx]),
873
                          "m"(ref[1-l][b_idx+d_idx]),
874
                          "m"(mv[l][b_idx][0]),
875
                          "m"(mv[1-l][b_idx+d_idx][0])
871
                        ::"r"(ref[l]+b_idx),
872
                          "r"(ref[1-l]+b_idx+d_idx),
873
                          "r"(mv[l][b_idx]),
874
                          "r"(mv[1-l][b_idx+d_idx])
876 875
                    );
877 876
                    }
878 877
                    __asm__ volatile(

Also available in: Unified diff