Revision a33a2562 libavcodec/x86/h264dsp_mmx.c

View differences:

libavcodec/x86/h264dsp_mmx.c
921 921
/***********************************/
922 922
/* weighted prediction */
923 923

  
924
/*
 * In-place weighted prediction of a w x h block of 8-bit samples at dst.
 *
 * Each sample p is replaced by
 *     clip_uint8((p * weight + (offset << log2_denom)
 *                 + ((1 << log2_denom) >> 1)) >> log2_denom)
 * evaluated in 16-bit lanes: pmullw keeps the low 16 bits of the product,
 * paddsw saturates as signed 16-bit, packuswb clamps to 0..255.
 *
 * dst        first sample of the block; read and written
 * stride     distance in bytes between consecutive rows
 * log2_denom right-shift amount applied after weighting
 * weight     multiplier; only its low 16 bits are broadcast and used
 * offset     additive term, pre-scaled by log2_denom below
 * w, h       block size; the loops step 4 columns and 2 rows at a time, so
 *            a w that is not a multiple of 4 or an odd h would touch
 *            samples outside the w x h area
 */
static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
925
{
926
    int x, y;
927
    /* Scale the offset to the pre-shift domain and fold in the rounding term. */
    offset <<= log2_denom;
928
    offset += (1 << log2_denom) >> 1;
929
    /* Constants: mm4 = weight and mm5 = offset broadcast to four words,
     * mm6 = shift count, mm7 = 0. No clobber list is declared; the asm
     * in the loop below expects these registers to still hold these values. */
    __asm__ volatile(
930
        "movd    %0, %%mm4        \n\t"
931
        "movd    %1, %%mm5        \n\t"
932
        "movd    %2, %%mm6        \n\t"
933
        "pshufw  $0, %%mm4, %%mm4 \n\t"
934
        "pshufw  $0, %%mm5, %%mm5 \n\t"
935
        "pxor    %%mm7, %%mm7     \n\t"
936
        :: "g"(weight), "g"(offset), "g"(log2_denom)
937
    );
938
    for(y=0; y<h; y+=2){
939
        for(x=0; x<w; x+=4){
940
            /* Four samples from the current row (mm0) and four from the row
             * below (mm1): widen to words, weight, add offset, shift, pack
             * back to bytes and store in place. */
            __asm__ volatile(
941
                "movd      %0,    %%mm0 \n\t"
942
                "movd      %1,    %%mm1 \n\t"
943
                "punpcklbw %%mm7, %%mm0 \n\t"
944
                "punpcklbw %%mm7, %%mm1 \n\t"
945
                "pmullw    %%mm4, %%mm0 \n\t"
946
                "pmullw    %%mm4, %%mm1 \n\t"
947
                "paddsw    %%mm5, %%mm0 \n\t"
948
                "paddsw    %%mm5, %%mm1 \n\t"
949
                "psraw     %%mm6, %%mm0 \n\t"
950
                "psraw     %%mm6, %%mm1 \n\t"
951
                "packuswb  %%mm7, %%mm0 \n\t"
952
                "packuswb  %%mm7, %%mm1 \n\t"
953
                "movd      %%mm0, %0    \n\t"
954
                "movd      %%mm1, %1    \n\t"
955
                : "+m"(*(uint32_t*)(dst+x)),
956
                  "+m"(*(uint32_t*)(dst+x+stride))
957
            );
958
        }
959
        /* Two rows were handled per iteration. */
        dst += 2*stride;
960
    }
961
}
962

  
963
/*
 * Bidirectional weighted prediction: blends a w x h block of 8-bit samples
 * from src into dst, writing the result to dst.
 *
 * Each output sample is
 *     clip_uint8((d * weightd + s * weights
 *                 + (((offset + 1) | 1) << log2_denom)) >> (log2_denom + 1))
 * where d and s are the co-located dst and src samples. Arithmetic is done
 * in 16-bit lanes: pmullw keeps the low 16 bits of each product, paddsw
 * saturates as signed 16-bit, packuswb clamps to 0..255.
 *
 * dst        first sample of the destination block; read and written
 * src        first sample of the second block; read only
 * stride     distance in bytes between consecutive rows, used for both
 *            dst and src
 * log2_denom base shift amount; the actual shift is log2_denom + 1
 * weightd    multiplier for dst samples (low 16 bits used)
 * weights    multiplier for src samples (low 16 bits used)
 * offset     additive term, transformed as shown above
 * w, h       block size; columns are stepped 4 at a time, so a w that is not
 *            a multiple of 4 would touch samples outside the block
 */
static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
964
{
965
    int x, y;
966
    /* Force the offset odd after adding one, then scale it to the pre-shift domain. */
    offset = ((offset + 1) | 1) << log2_denom;
967
    /* Constants: mm3 = weightd, mm4 = weights and mm5 = offset broadcast to
     * four words, mm6 = shift count (log2_denom + 1), mm7 = 0. No clobber
     * list is declared; the asm in the loop below expects these registers to
     * still hold these values. */
    __asm__ volatile(
968
        "movd    %0, %%mm3        \n\t"
969
        "movd    %1, %%mm4        \n\t"
970
        "movd    %2, %%mm5        \n\t"
971
        "movd    %3, %%mm6        \n\t"
972
        "pshufw  $0, %%mm3, %%mm3 \n\t"
973
        "pshufw  $0, %%mm4, %%mm4 \n\t"
974
        "pshufw  $0, %%mm5, %%mm5 \n\t"
975
        "pxor    %%mm7, %%mm7     \n\t"
976
        :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
977
    );
978
    for(y=0; y<h; y++){
979
        for(x=0; x<w; x+=4){
980
            /* %0 is four dst samples (read and written), %1 is four src
             * samples: widen both to words, weight each, sum, add offset,
             * shift, pack back to bytes and store to dst. */
            __asm__ volatile(
981
                "movd      %0,    %%mm0 \n\t"
982
                "movd      %1,    %%mm1 \n\t"
983
                "punpcklbw %%mm7, %%mm0 \n\t"
984
                "punpcklbw %%mm7, %%mm1 \n\t"
985
                "pmullw    %%mm3, %%mm0 \n\t"
986
                "pmullw    %%mm4, %%mm1 \n\t"
987
                "paddsw    %%mm1, %%mm0 \n\t"
988
                "paddsw    %%mm5, %%mm0 \n\t"
989
                "psraw     %%mm6, %%mm0 \n\t"
990
                "packuswb  %%mm0, %%mm0 \n\t"
991
                "movd      %%mm0, %0    \n\t"
992
                : "+m"(*(uint32_t*)(dst+x))
993
                :  "m"(*(uint32_t*)(src+x))
994
            );
995
        }
996
        src += stride;
997
        dst += stride;
998
    }
999
}
1000

  
1001
/*
 * Defines two static fixed-size wrappers for a W x H block:
 *   ff_h264_biweight_<W>x<H>_mmx2() forwards to ff_h264_biweight_WxH_mmx2()
 *   ff_h264_weight_<W>x<H>_mmx2()   forwards to ff_h264_weight_WxH_mmx2()
 * passing W and H as the trailing width/height arguments.
 */
#define H264_WEIGHT(W,H) \
1002
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
1003
    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
1004
} \
1005
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
1006
    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
1007
}
1008

  
1009
/* Instantiate the mmx2 weight/biweight wrappers for eight block sizes,
 * from 16x16 down to 4x2. All widths are multiples of 4 and all heights
 * are even. */
H264_WEIGHT(16,16)
1010
H264_WEIGHT(16, 8)
1011
H264_WEIGHT( 8,16)
1012
H264_WEIGHT( 8, 8)
1013
H264_WEIGHT( 8, 4)
1014
H264_WEIGHT( 4, 8)
1015
H264_WEIGHT( 4, 4)
1016
H264_WEIGHT( 4, 2)
1017

  
1018
/* Prototypes for the 8x8 and 16x16 biweight functions in SSE2 and SSSE3
 * variants. Only declarations appear here; the definitions are not visible
 * in this file section (presumably external assembly - confirm). The
 * parameter list matches the mmx2 biweight wrappers. */
void ff_h264_biweight_8x8_sse2(uint8_t *dst, uint8_t *src, int stride,
1019
                               int log2_denom, int weightd, int weights,
1020
                               int offset);
1021

  
1022
void ff_h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
1023
                                 int log2_denom, int weightd, int weights,
1024
                                 int offset);
1025

  
1026
void ff_h264_biweight_8x8_ssse3(uint8_t *dst, uint8_t *src, int stride,
1027
                                int log2_denom, int weightd, int weights,
1028
                                int offset);
1029

  
1030
void ff_h264_biweight_16x16_ssse3(uint8_t *dst, uint8_t *src, int stride,
1031
                                  int log2_denom, int weightd, int weights,
1032
                                  int offset);
924
/* Declares (does not define) ff_h264_weight_<W>x<H>_<OPT>(). The expansion
 * already ends in ';', so invocations are written without a trailing
 * semicolon. */
#define H264_WEIGHT(W, H, OPT) \
925
void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
926
    int stride, int log2_denom, int weight, int offset);
927

  
928
/* Declares (does not define) ff_h264_biweight_<W>x<H>_<OPT>(). The expansion
 * already ends in ';', so invocations are written without a trailing
 * semicolon. */
#define H264_BIWEIGHT(W, H, OPT) \
929
void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
930
    uint8_t *src, int stride, int log2_denom, int weightd, \
931
    int weights, int offset);
932

  
933
/* Declares both the weight and the biweight mmx2 prototypes for a W x H
 * block (despite the name, not only the biweight one). */
#define H264_BIWEIGHT_MMX(W,H) \
934
H264_WEIGHT  (W, H, mmx2) \
935
H264_BIWEIGHT(W, H, mmx2)
936

  
937
/* Declares the mmx2 weight/biweight prototypes plus the sse2 weight, sse2
 * biweight and ssse3 biweight prototypes for a W x H block. No ssse3 weight
 * prototype is declared. */
#define H264_BIWEIGHT_MMX_SSE(W,H) \
938
H264_BIWEIGHT_MMX(W, H) \
939
H264_WEIGHT      (W, H, sse2) \
940
H264_BIWEIGHT    (W, H, sse2) \
941
H264_BIWEIGHT    (W, H, ssse3)
942

  
943
/* Prototype sets per block size: widths 16 and 8 get mmx2, sse2 and ssse3
 * declarations; width 4 gets the mmx2 declarations only. */
H264_BIWEIGHT_MMX_SSE(16, 16)
944
H264_BIWEIGHT_MMX_SSE(16,  8)
945
H264_BIWEIGHT_MMX_SSE( 8, 16)
946
H264_BIWEIGHT_MMX_SSE( 8,  8)
947
H264_BIWEIGHT_MMX_SSE( 8,  4)
948
H264_BIWEIGHT_MMX    ( 4,  8)
949
H264_BIWEIGHT_MMX    ( 4,  4)
950
H264_BIWEIGHT_MMX    ( 4,  2)
1033 951

  
1034 952
void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
1035 953
void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
......
1076 994
            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
1077 995
            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
1078 996
            c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
997
        }
998
        if(mm_flags & FF_MM_SSE2){
999
            c->h264_idct8_add = ff_h264_idct8_add_sse2;
1000
            c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
1001
        }
1079 1002

  
1003
#if HAVE_YASM
1004
        if (mm_flags & FF_MM_MMX2){
1005
#if ARCH_X86_32
1006
            c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
1007
            c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
1008
#endif
1080 1009
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
1081 1010
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
1082 1011
            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
......
1094 1023
            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
1095 1024
            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
1096 1025
            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
1097
        }
1098
        if(mm_flags & FF_MM_SSE2){
1099
            c->h264_idct8_add = ff_h264_idct8_add_sse2;
1100
            c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
1101
        }
1102 1026

  
1103
#if HAVE_YASM
1104
        if (mm_flags & FF_MM_MMX2){
1105
#if ARCH_X86_32
1106
            c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
1107
            c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
1108
#endif
1109 1027
            if( mm_flags&FF_MM_SSE2 ){
1028
                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
1029
                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
1030
                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
1031
                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
1032
                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
1033

  
1110 1034
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
1035
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
1036
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
1111 1037
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
1038
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
1039

  
1112 1040
#if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110
1113 1041
                c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
1114 1042
                c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
......
1123 1051
            }
1124 1052
            if ( mm_flags&FF_MM_SSSE3 ){
1125 1053
                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
1054
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
1055
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
1126 1056
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
1057
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
1127 1058
            }
1128 1059
        }
1129 1060
#endif

Also available in: Unified diff