Revision 19fb234e

View differences:

libavcodec/dsputil.h
64 64
void ff_h264_idct8_add4_c(uint8_t *dst, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
65 65
void ff_h264_idct_add8_c(uint8_t **dest, const int *blockoffset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
66 66

  
67
void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul);
68
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp);
69
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
70

  
67 71
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
68 72
                             const float *win, float add_bias, int len);
69 73
void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
libavcodec/h264.c
246 246
    return 0;
247 247
}
248 248

  
249
/**
250
 * IDCT transforms the 16 dc values and dequantizes them.
251
 * @param qp quantization parameter
252
 */
253
static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
254
#define stride 16
255
    int i;
256
    int temp[16]; //FIXME check if this is a good idea
257
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
258
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
259

  
260
//memset(block, 64, 2*256);
261
//return;
262
    for(i=0; i<4; i++){
263
        const int offset= y_offset[i];
264
        const int z0= block[offset+stride*0] + block[offset+stride*4];
265
        const int z1= block[offset+stride*0] - block[offset+stride*4];
266
        const int z2= block[offset+stride*1] - block[offset+stride*5];
267
        const int z3= block[offset+stride*1] + block[offset+stride*5];
268

  
269
        temp[4*i+0]= z0+z3;
270
        temp[4*i+1]= z1+z2;
271
        temp[4*i+2]= z1-z2;
272
        temp[4*i+3]= z0-z3;
273
    }
274

  
275
    for(i=0; i<4; i++){
276
        const int offset= x_offset[i];
277
        const int z0= temp[4*0+i] + temp[4*2+i];
278
        const int z1= temp[4*0+i] - temp[4*2+i];
279
        const int z2= temp[4*1+i] - temp[4*3+i];
280
        const int z3= temp[4*1+i] + temp[4*3+i];
281

  
282
        block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
283
        block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
284
        block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
285
        block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
286
    }
287
}
288

  
289 249
#if 0
290 250
/**
291 251
 * DCT transforms the 16 dc values.
......
1245 1205
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
1246 1206
                if(is_h264){
1247 1207
                    if(!transform_bypass)
1248
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
1208
                        h->h264dsp.h264_luma_dc_dequant_idct(h->mb, h->mb_luma_dc, h->dequant4_coeff[0][s->qscale][0]);
1209
                    else{
1210
                        static const uint8_t dc_mapping[16] = { 0*16, 1*16, 4*16, 5*16, 2*16, 3*16, 6*16, 7*16,
1211
                                                                8*16, 9*16,12*16,13*16,10*16,11*16,14*16,15*16};
1212
                        for(i = 0; i < 16; i++)
1213
                            h->mb[dc_mapping[i]] = h->mb_luma_dc[i];
1214
                    }
1249 1215
                }else
1250
                    ff_svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
1216
                    ff_svq3_luma_dc_dequant_idct_c(h->mb, h->mb_luma_dc, s->qscale);
1251 1217
            }
1252 1218
            if(h->deblocking_filter)
1253 1219
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
libavcodec/h264.h
406 406
    GetBitContext *inter_gb_ptr;
407 407

  
408 408
    DECLARE_ALIGNED(16, DCTELEM, mb)[16*24];
409
    DECLARE_ALIGNED(16, DCTELEM, mb_luma_dc)[16];
409 410
    DCTELEM mb_padding[256];        ///< as mb is addressed by scantable[i] and scantable is uint8_t we can either check that i is not too large or ensure that there is some unused stuff after mb
410 411

  
411 412
    /**
......
600 601

  
601 602
extern const uint8_t ff_h264_chroma_qp[52];
602 603

  
603
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
604

  
605
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
606

  
607 604
/**
608 605
 * Decode SEI
609 606
 */
libavcodec/h264_cabac.c
1597 1597
    s->current_picture.mb_type[mb_xy]= mb_type;
1598 1598

  
1599 1599
    if( cbp || IS_INTRA16x16( mb_type ) ) {
1600
        const uint8_t *scan, *scan8x8, *dc_scan;
1600
        const uint8_t *scan, *scan8x8;
1601 1601
        const uint32_t *qmul;
1602 1602

  
1603 1603
        if(IS_INTERLACED(mb_type)){
1604 1604
            scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
1605 1605
            scan= s->qscale ? h->field_scan : h->field_scan_q0;
1606
            dc_scan= luma_dc_field_scan;
1607 1606
        }else{
1608 1607
            scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
1609 1608
            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
1610
            dc_scan= luma_dc_zigzag_scan;
1611 1609
        }
1612 1610

  
1613 1611
        // decode_cabac_mb_dqp
......
1642 1640
        if( IS_INTRA16x16( mb_type ) ) {
1643 1641
            int i;
1644 1642
            //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
1645
            decode_cabac_residual_dc( h, h->mb, 0, 0, dc_scan, 16);
1643
            AV_ZERO128(h->mb_luma_dc+0);
1644
            AV_ZERO128(h->mb_luma_dc+8);
1645
            decode_cabac_residual_dc( h, h->mb_luma_dc, 0, 0, scan, 16);
1646 1646

  
1647 1647
            if( cbp&15 ) {
1648 1648
                qmul = h->dequant4_coeff[0][s->qscale];
libavcodec/h264_cavlc.c
911 911
        int i8x8, i4x4, chroma_idx;
912 912
        int dquant;
913 913
        GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
914
        const uint8_t *scan, *scan8x8, *dc_scan;
914
        const uint8_t *scan, *scan8x8;
915 915

  
916 916
        if(IS_INTERLACED(mb_type)){
917 917
            scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
918 918
            scan= s->qscale ? h->field_scan : h->field_scan_q0;
919
            dc_scan= luma_dc_field_scan;
920 919
        }else{
921 920
            scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
922 921
            scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
923
            dc_scan= luma_dc_zigzag_scan;
924 922
        }
925 923

  
926 924
        dquant= get_se_golomb(&s->gb);
......
939 937
        h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
940 938
        h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
941 939
        if(IS_INTRA16x16(mb_type)){
942
            if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
940
            AV_ZERO128(h->mb_luma_dc+0);
941
            AV_ZERO128(h->mb_luma_dc+8);
942
            if( decode_residual(h, h->intra_gb_ptr, h->mb_luma_dc, LUMA_DC_BLOCK_INDEX, scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
943 943
                return -1; //FIXME continue if partitioned and other return -1 too
944 944
            }
945 945

  
libavcodec/h264dsp.c
282 282
    c->h264_idct8_add4     = ff_h264_idct8_add4_c;
283 283
    c->h264_idct_add8      = ff_h264_idct_add8_c;
284 284
    c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
285
    c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_c;
285 286

  
286 287
    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
287 288
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
libavcodec/h264dsp.h
65 65
    void (*h264_idct8_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
66 66
    void (*h264_idct_dc_add)(uint8_t *dst/*align 4*/, DCTELEM *block/*align 16*/, int stride);
67 67
    void (*h264_idct8_dc_add)(uint8_t *dst/*align 8*/, DCTELEM *block/*align 16*/, int stride);
68

  
68 69
    void (*h264_dct)(DCTELEM block[4][4]);
69 70
    void (*h264_idct_add16)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
70 71
    void (*h264_idct8_add4)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
71 72
    void (*h264_idct_add8)(uint8_t **dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
72 73
    void (*h264_idct_add16intra)(uint8_t *dst/*align 16*/, const int *blockoffset, DCTELEM *block/*align 16*/, int stride, const uint8_t nnzc[6*8]);
74
    void (*h264_luma_dc_dequant_idct)(DCTELEM *output, DCTELEM *input/*align 16*/, int qmul);
73 75
}H264DSPContext;
74 76

  
75 77
void ff_h264dsp_init(H264DSPContext *c);
libavcodec/h264idct.c
216 216
            ff_h264_idct_dc_add_c(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
217 217
    }
218 218
}
219
/**
220
 * IDCT transforms the 16 dc values and dequantizes them.
221
 * @param qp quantization parameter
222
 */
223
void ff_h264_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qmul){
224
#define stride 16
225
    int i;
226
    int temp[16];
227
    static const uint8_t x_offset[4]={0, 2*stride, 8*stride, 10*stride};
228

  
229
    for(i=0; i<4; i++){
230
        const int z0= input[4*i+0] + input[4*i+1];
231
        const int z1= input[4*i+0] - input[4*i+1];
232
        const int z2= input[4*i+2] - input[4*i+3];
233
        const int z3= input[4*i+2] + input[4*i+3];
234

  
235
        temp[4*i+0]= z0+z3;
236
        temp[4*i+1]= z0-z3;
237
        temp[4*i+2]= z1-z2;
238
        temp[4*i+3]= z1+z2;
239
    }
240

  
241
    for(i=0; i<4; i++){
242
        const int offset= x_offset[i];
243
        const int z0= temp[4*0+i] + temp[4*2+i];
244
        const int z1= temp[4*0+i] - temp[4*2+i];
245
        const int z2= temp[4*1+i] - temp[4*3+i];
246
        const int z3= temp[4*1+i] + temp[4*3+i];
247

  
248
        output[stride* 0+offset]= ((((z0 + z3)*qmul + 128 ) >> 8));
249
        output[stride* 1+offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
250
        output[stride* 4+offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
251
        output[stride* 5+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
252
    }
253
}
libavcodec/svq3.c
126 126
};
127 127

  
128 128

  
129
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp)
129
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp)
130 130
{
131 131
    const int qmul = svq3_dequant_coeff[qp];
132 132
#define stride 16
133 133
    int i;
134 134
    int temp[16];
135 135
    static const int x_offset[4] = {0, 1*stride, 4* stride,  5*stride};
136
    static const int y_offset[4] = {0, 2*stride, 8* stride, 10*stride};
137 136

  
138 137
    for (i = 0; i < 4; i++){
139
        const int offset = y_offset[i];
140
        const int z0 = 13*(block[offset+stride*0] +    block[offset+stride*4]);
141
        const int z1 = 13*(block[offset+stride*0] -    block[offset+stride*4]);
142
        const int z2 =  7* block[offset+stride*1] - 17*block[offset+stride*5];
143
        const int z3 = 17* block[offset+stride*1] +  7*block[offset+stride*5];
138
        const int z0= 13*(input[4*i+0] +    input[4*i+1]);
139
        const int z1= 13*(input[4*i+0] -    input[4*i+1]);
140
        const int z2=  7* input[4*i+2] - 17*input[4*i+3];
141
        const int z3= 17* input[4*i+2] +  7*input[4*i+3];
144 142

  
145 143
        temp[4*i+0] = z0+z3;
146 144
        temp[4*i+1] = z1+z2;
......
155 153
        const int z2 =  7* temp[4*1+i] - 17*temp[4*3+i];
156 154
        const int z3 = 17* temp[4*1+i] +  7*temp[4*3+i];
157 155

  
158
        block[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
159
        block[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
160
        block[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
161
        block[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
156
        output[stride*0 +offset] = ((z0 + z3)*qmul + 0x80000) >> 20;
157
        output[stride*2 +offset] = ((z1 + z2)*qmul + 0x80000) >> 20;
158
        output[stride*8 +offset] = ((z1 - z2)*qmul + 0x80000) >> 20;
159
        output[stride*10+offset] = ((z0 - z3)*qmul + 0x80000) >> 20;
162 160
    }
163 161
}
164 162
#undef stride
libavcodec/x86/dsputil_mmx.c
41 41
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
42 42
{0x8000000080000000ULL, 0x8000000080000000ULL};
43 43

  
44
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_1  ) = 0x0001000100010001ULL;
44 45
DECLARE_ALIGNED(8,  const xmm_reg,  ff_pw_3  ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
45 46
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4  ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
46 47
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
libavcodec/x86/h264_idct.asm
47 47
%endif
48 48

  
49 49
cextern pw_32
50
cextern pw_1
50 51

  
51 52
SECTION .text
52 53

  
......
854 855
    add8_sse2_cycle 2, 0x21
855 856
    add8_sse2_cycle 3, 0x29
856 857
    RET
858

  
859
;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
860

  
861
; WALSH4_1D a, b, c, d, tmp
; Runs two SUMSUB_BADC passes over registers m%1..m%4 and then SWAPs
; %1, %4 and %3. m%5 is handed to SUMSUB_BADC as its fifth argument
; (presumably a scratch register; SUMSUB_BADC is defined outside this
; excerpt -- confirm).
%macro WALSH4_1D 5
862
    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
863
    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
864
    SWAP %1, %4, %3
865
%endmacro
866

  
867
; DEQUANT_MMX reg_a, reg_b, shift
; Scales the four signed words in each of %1 and %2 and writes the results
; back to %1 and %2. Clobbers m4, m5 and m7.
;
; Every word is interleaved with a constant 1 taken from pw_1, so pmaddwd
; produces word*low16(t3d) + 1*high16(t3d) per dword: the low half of t3d is
; the multiplier and the high half is an additive (rounding) term. The
; products are shifted right arithmetically by %3 and packed back to words
; with signed saturation.
%macro DEQUANT_MMX 3
868
    mova        m7, [pw_1]
869
    mova        m4, %1
870
    punpcklwd   %1, m7          ; low two words of %1, each paired with 1
871
    punpckhwd   m4, m7          ; high two words of %1, each paired with 1
872
    mova        m5, %2
873
    punpcklwd   %2, m7
874
    punpckhwd   m5, m7
875
    movd        m7, t3d
876
    punpckldq   m7, m7          ; duplicate t3d into both dwords of m7
877
    pmaddwd     %1, m7
878
    pmaddwd     %2, m7
879
    pmaddwd     m4, m7
880
    pmaddwd     m5, m7
881
    psrad       %1, %3
882
    psrad       %2, %3
883
    psrad       m4, %3
884
    psrad       m5, %3
885
    packssdw    %1, m4          ; back to four words in original order
886
    packssdw    %2, m5
887
%endmacro
888

  
889
; STORE_WORDS_MMX reg, i0, i1, i2, i3
; Scatters the four words of %1 to memory at [t2 + index*32]:
; word 0 -> %2, word 1 -> %3, word 2 -> %4, word 3 -> %5.
; Destroys %1 and overwrites t0 and t1.
%macro STORE_WORDS_MMX 5
890
    movd  t0d, %1               ; t0d = words 0 and 1
891
    psrlq  %1, 32
892
    movd  t1d, %1               ; t1d = words 2 and 3
893
    mov [t2+%2*32], t0w
894
    mov [t2+%4*32], t1w
895
    shr   t0d, 16
896
    shr   t1d, 16
897
    mov [t2+%3*32], t0w
898
    mov [t2+%5*32], t1w
899
%endmacro
900

  
901
; DEQUANT_STORE_MMX shift
; Applies DEQUANT_MMX with shift %1 to m0/m1 and then to m2/m3, storing each
; register's four words through STORE_WORDS_MMX at the listed indices
; (each index is multiplied by 32 bytes relative to t2).
%macro DEQUANT_STORE_MMX 1
902
    DEQUANT_MMX m0, m1, %1
903
    STORE_WORDS_MMX m0,  0,  1,  4,  5
904
    STORE_WORDS_MMX m1,  2,  3,  6,  7
905

  
906
    DEQUANT_MMX m2, m3, %1
907
    STORE_WORDS_MMX m2,  8,  9, 12, 13
908
    STORE_WORDS_MMX m3, 10, 11, 14, 15
909
%endmacro
910

  
911
; STORE_WORDS_SSE reg, i0, i1, i2, i3, i4, i5, i6, i7
; Scatters the eight words of %1 to memory at [t2 + index*32]:
; word k goes to the index given by argument %(k+2), i.e. word 0 -> %2,
; word 1 -> %3, ... word 7 -> %9.
; Destroys %1 and overwrites t0 and t1.
%macro STORE_WORDS_SSE 9
912
    movd  t0d, %1               ; t0d = words 0 and 1
913
    psrldq  %1, 4
914
    movd  t1d, %1               ; t1d = words 2 and 3
915
    psrldq  %1, 4
916
    mov [t2+%2*32], t0w
917
    mov [t2+%4*32], t1w
918
    shr   t0d, 16
919
    shr   t1d, 16
920
    mov [t2+%3*32], t0w
921
    mov [t2+%5*32], t1w
922
    movd  t0d, %1               ; t0d = words 4 and 5
923
    psrldq  %1, 4
924
    movd  t1d, %1               ; t1d = words 6 and 7
925
    mov [t2+%6*32], t0w
926
    mov [t2+%8*32], t1w
927
    shr   t0d, 16
928
    shr   t1d, 16
929
    mov [t2+%7*32], t0w
930
    mov [t2+%9*32], t1w
931
%endmacro
932

  
933
; DEQUANT_STORE_SSE2 shift
; SSE2 counterpart of DEQUANT_STORE_MMX: copies m0..m3 into xmm0..xmm3,
; pairs every word with a constant 1 from pw_1, multiplies with pmaddwd by
; t3d broadcast to all four dwords (low 16 bits = multiplier, high 16 bits =
; additive term), shifts right arithmetically by %1, packs back to words and
; scatters them with STORE_WORDS_SSE.
; Uses xmm0..xmm5 by explicit name.
%macro DEQUANT_STORE_SSE2 1
934
    movd      xmm4, t3d
935
    movq      xmm5, [pw_1]
936
    pshufd    xmm4, xmm4, 0     ; broadcast t3d to all four dwords
937
    movq2dq   xmm0, m0
938
    movq2dq   xmm1, m1
939
    movq2dq   xmm2, m2
940
    movq2dq   xmm3, m3
941
    punpcklwd xmm0, xmm5
942
    punpcklwd xmm1, xmm5
943
    punpcklwd xmm2, xmm5
944
    punpcklwd xmm3, xmm5
945
    pmaddwd   xmm0, xmm4
946
    pmaddwd   xmm1, xmm4
947
    pmaddwd   xmm2, xmm4
948
    pmaddwd   xmm3, xmm4
949
    psrad     xmm0, %1
950
    psrad     xmm1, %1
951
    psrad     xmm2, %1
952
    psrad     xmm3, %1
953
    packssdw  xmm0, xmm1        ; xmm0 = results of m0 (low half) and m1 (high half)
954
    packssdw  xmm2, xmm3        ; xmm2 = results of m2 (low half) and m3 (high half)
955
    STORE_WORDS_SSE xmm0,  0,  1,  4,  5,  2,  3,  6,  7
956
    STORE_WORDS_SSE xmm2,  8,  9, 12, 13, 10, 11, 14, 15
957
%endmacro
958

  
959
; IDCT_DC_DEQUANT suffix, xmm_count
; Emits the function h264_luma_dc_dequant_idct_<suffix>.
;   %1  function-name suffix; "mmx" selects the MMX dequant/store path,
;       anything else selects the SSE2 one
;   %2  forwarded as the last argument of cglobal (presumably the number of
;       xmm registers used -- confirm against x86inc)
; The body loads four qwords from r1, runs WALSH4_1D, TRANSPOSE4x4W and
; WALSH4_1D again, and then dequantizes and stores the result relative to t2.
%macro IDCT_DC_DEQUANT 2
960
cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
961
    movq        m3, [r1+24]
962
    movq        m2, [r1+16]
963
    movq        m1, [r1+ 8]
964
    movq        m0, [r1+ 0]
965
    WALSH4_1D    0,1,2,3,4
966
    TRANSPOSE4x4W 0,1,2,3,4
967
    WALSH4_1D    0,1,2,3,4
968

  
969
; t0..t3 are assigned, in this order, to: shift, tmp, output, qmul
970
%ifdef WIN64
971
    DECLARE_REG_TMP 0,3,1,2
972
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
973
    xchg        r0, t2
974
%elifdef ARCH_X86_64
975
    DECLARE_REG_TMP 3,1,0,2
976
%else
977
    DECLARE_REG_TMP 1,3,0,2
978
%endif
979

  
980
    cmp        t3d, 32767       ; multiplier must fit a signed word for pmaddwd
981
    jg .big_qmul
982
    add        t3d, 128 << 16   ; high word = 128: rounding term for the >> 8
983
%ifidn %1,mmx
984
    DEQUANT_STORE_MMX 8
985
%else
986
    DEQUANT_STORE_SSE2 8
987
%endif
988
    RET
989
.big_qmul:
    ; Multiplier too large for a signed word: pre-shift it (together with the
    ; rounding term) right by t0 = min(bsr(qmul), 7) and reduce the final
    ; shift to 8 - t0.
    ; NOTE(review): cmovg is also assembled into the mmx variant; CMOV is not
    ; implied by MMX support alone -- confirm the CPU-flag check that selects
    ; this function.
990
    bsr        t0d, t3d         ; t0d = index of the highest set bit of qmul
991
    add        t3d, 128 << 16
992
    mov        t1d, 7
993
    cmp        t0d, t1d
994
    cmovg      t0d, t1d         ; t0d = min(t0d, 7)
995
    inc        t1d              ; t1d = 8
996
    shr        t3d, t0b
997
    sub        t1d, t0d         ; t1d = 8 - t0d, the remaining shift
998
%ifidn %1,mmx
999
    movd        m6, t1d
1000
    DEQUANT_STORE_MMX m6
1001
%else
1002
    movd      xmm6, t1d
1003
    DEQUANT_STORE_SSE2 xmm6
1004
%endif
1005
    RET
1006
%endmacro
1007

  
1008
; Instantiate both variants. The sse2 one is also expanded under INIT_MMX:
; its transform stage uses the m0..m4 registers and DEQUANT_STORE_SSE2 names
; the xmm registers explicitly.
INIT_MMX
1009
IDCT_DC_DEQUANT mmx, 0
1010
IDCT_DC_DEQUANT sse2, 7
libavcodec/x86/h264dsp_mmx.c
59 59
                                  int stride, const uint8_t nnzc[6*8]);
60 60
void ff_h264_idct_add8_sse2      (uint8_t **dest, const int *block_offset, DCTELEM *block,
61 61
                                  int stride, const uint8_t nnzc[6*8]);
62
void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
63
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);
62 64

  
63 65
/***********************************/
64 66
/* deblocking */
......
301 303
        c->h264_idct8_add4     = ff_h264_idct8_add4_mmx;
302 304
        c->h264_idct_add8      = ff_h264_idct_add8_mmx;
303 305
        c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx;
306
        c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;
304 307

  
305 308
        if (mm_flags & AV_CPU_FLAG_MMX2) {
306 309
            c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2;
......
341 344
            if (mm_flags&AV_CPU_FLAG_SSE2) {
342 345
                c->h264_idct8_add = ff_h264_idct8_add_sse2;
343 346
                c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
347
                c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
344 348

  
345 349
                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
346 350
                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;

Also available in: Unified diff