Revision b926572a

View differences:

libavcodec/i386/dsputil_mmx.c
2832 2832
            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
2833 2833
            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
2834 2834

  
2835
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
2836
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
2837
            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
2838
            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
2839
            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
2840
            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
2841
            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
2842
            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
2843

  
2844
            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
2845
            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
2846
            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
2847
            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
2848
            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
2849
            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
2850
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
2851
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
2852

  
2835 2853
#ifdef CONFIG_ENCODERS
2836 2854
            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
2837 2855
#endif //CONFIG_ENCODERS
libavcodec/i386/h264dsp_mmx.c
909 909
#undef H264_CHROMA_OP
910 910
#undef H264_CHROMA_MC8_TMPL
911 911

  
912
/***********************************/
913
/* weighted prediction */
914

  
915
/**
 * In-place weighted prediction of a w x h block of 8-bit pixels (MMX2).
 *
 * Every pixel p is replaced by
 *     clip_uint8((p * weight + (offset << log2_denom) + round) >> log2_denom)
 * with round = (1 << log2_denom) >> 1.
 *
 * All arithmetic is done on signed 16-bit lanes: the product keeps only its
 * low 16 bits (pmullw) and the offset is added with signed saturation, so an
 * out-of-range sum clamps towards the correct end of the 0..255 range
 * instead of wrapping around to the opposite one.
 *
 * @param dst        top-left pixel of the block, modified in place
 * @param stride     distance in bytes between two consecutive rows
 * @param log2_denom right-shift applied after weighting
 * @param weight     multiplicative weight (only its low 16 bits are used)
 * @param offset     additive offset, scaled by 1 << log2_denom before use
 * @param w          block width; pixels are processed in groups of 4
 * @param h          block height; rows are processed in pairs
 *
 * No emms is issued here; presumably the caller takes care of restoring the
 * FPU state -- confirm against the call sites.
 */
static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
{
    int x, y;
    /* Multiply rather than shift: left-shifting a negative offset is
     * undefined behaviour in C, the product is the same value. */
    offset = offset * (1 << log2_denom) + ((1 << log2_denom) >> 1);
    /* Load the loop constants once. The per-pixel asm statements below rely
     * on mm4..mm7 still holding these values.
     * "rm" instead of "g": movd cannot take an immediate operand. */
    __asm__ volatile(
        "movd    %0, %%mm4        \n\t" /* weight                         */
        "movd    %1, %%mm5        \n\t" /* scaled offset + rounding       */
        "movd    %2, %%mm6        \n\t" /* shift count                    */
        "pshufw  $0, %%mm4, %%mm4 \n\t" /* broadcast weight to 4 words    */
        "pshufw  $0, %%mm5, %%mm5 \n\t" /* broadcast offset to 4 words    */
        "pxor    %%mm7, %%mm7     \n\t" /* zero, for byte<->word (un)pack */
        :: "rm"(weight), "rm"(offset), "rm"(log2_denom)
    );
    for(y=0; y<h; y+=2){
        for(x=0; x<w; x+=4){
            /* 4 pixels from each of two adjacent rows per iteration */
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t"
                "movd      %1,    %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm4, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                /* saturating add: a wrapping paddw would turn large
                 * positive sums negative and pack them to 0 */
                "paddsw    %%mm5, %%mm0 \n\t"
                "paddsw    %%mm5, %%mm1 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "psraw     %%mm6, %%mm1 \n\t"
                "packuswb  %%mm7, %%mm0 \n\t"
                "packuswb  %%mm7, %%mm1 \n\t"
                "movd      %%mm0, %0    \n\t"
                "movd      %%mm1, %1    \n\t"
                : "+m"(*(uint32_t*)(dst+x)),
                  "+m"(*(uint32_t*)(dst+x+stride))
            );
        }
        dst += 2*stride;
    }
}
953

  
954
/**
 * In-place bidirectional weighted prediction of a w x h block (MMX2).
 *
 * Every destination pixel d is combined with the co-located source pixel s:
 *     d = clip_uint8((d * weightd + s * weights + offset) >> (log2_denom + 1))
 * where offset = ((offsets + offsetd + 1) | 1) << log2_denom.
 *
 * All arithmetic is done on signed 16-bit lanes: each product keeps only its
 * low 16 bits (pmullw) and the additions saturate, so an out-of-range sum
 * clamps instead of wrapping around to the opposite sign.
 *
 * @param dst        first prediction, overwritten with the result
 * @param src        second prediction, read only
 * @param stride     distance in bytes between rows, used for both dst and src
 * @param log2_denom the final right-shift is log2_denom + 1
 * @param weightd    weight applied to dst (only its low 16 bits are used)
 * @param weights    weight applied to src (only its low 16 bits are used)
 * @param offsetd    offset associated with dst
 * @param offsets    offset associated with src
 * @param w          block width; pixels are processed in groups of 4
 * @param h          block height in rows
 *
 * No emms is issued here; presumably the caller takes care of restoring the
 * FPU state -- confirm against the call sites.
 */
static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets, int w, int h)
{
    int x, y;
    /* Multiply rather than shift: left-shifting a negative value is
     * undefined behaviour in C, the product is the same value. */
    int offset = ((offsets + offsetd + 1) | 1) * (1 << log2_denom);
    /* Load the loop constants once. The per-pixel asm statements below rely
     * on mm3..mm7 still holding these values.
     * "rm" instead of "g": movd cannot take an immediate operand. */
    __asm__ volatile(
        "movd    %0, %%mm3        \n\t" /* weightd                        */
        "movd    %1, %%mm4        \n\t" /* weights                        */
        "movd    %2, %%mm5        \n\t" /* combined offset                */
        "movd    %3, %%mm6        \n\t" /* shift count                    */
        "pshufw  $0, %%mm3, %%mm3 \n\t"
        "pshufw  $0, %%mm4, %%mm4 \n\t"
        "pshufw  $0, %%mm5, %%mm5 \n\t"
        "pxor    %%mm7, %%mm7     \n\t" /* zero, for byte->word unpack    */
        :: "rm"(weightd), "rm"(weights), "rm"(offset), "rm"(log2_denom+1)
    );
    for(y=0; y<h; y++){
        for(x=0; x<w; x+=4){
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t"
                "movd      %1,    %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm3, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                /* saturating adds: a wrapping paddw would turn large
                 * positive sums negative and pack them to 0 */
                "paddsw    %%mm5, %%mm0 \n\t"
                "paddsw    %%mm1, %%mm0 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"
                : "+m"(*(uint32_t*)(dst+x))
                :  "m"(*(uint32_t*)(src+x))
            );
        }
        src += stride;
        dst += stride;
    }
}
991

  
992
/*
 * H264_WEIGHT(W,H) emits the two fixed-size entry points
 *     ff_h264_weight_WxH_mmx2()   and   ff_h264_biweight_WxH_mmx2()
 * for one block size. Each simply forwards to the generic WxH worker with
 * the width and height passed as compile-time constants.
 */
#define H264_WEIGHT(W,H) \
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
} \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offsetd, offsets, W, H); \
}

/* One instantiation per supported block size, widest first. */
H264_WEIGHT(16,16)
H264_WEIGHT(16, 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
H264_WEIGHT( 4, 8)
H264_WEIGHT( 4, 4)
H264_WEIGHT( 4, 2)
1008

  

Also available in: Unified diff