Revision e6e98234

View differences:

libavcodec/ac3enc.c
167 167
static void mdct512(AC3MDCTContext *mdct, CoefType *out, SampleType *in);
168 168

  
169 169
static void apply_window(DSPContext *dsp, SampleType *output, const SampleType *input,
170
                         const SampleType *window, int n);
170
                         const SampleType *window, unsigned int len);
171 171

  
172 172
static int normalize_samples(AC3EncodeContext *s);
173 173

  
libavcodec/ac3enc_fixed.c
252 252
 * Apply KBD window to input samples prior to MDCT.
253 253
 */
254 254
static void apply_window(DSPContext *dsp, int16_t *output, const int16_t *input,
255
                         const int16_t *window, int n)
255
                         const int16_t *window, unsigned int len)
256 256
{
257
    int i;
258
    int n2 = n >> 1;
259

  
260
    for (i = 0; i < n2; i++) {
261
        output[i]     = MUL16(input[i],     window[i]) >> 15;
262
        output[n-i-1] = MUL16(input[n-i-1], window[i]) >> 15;
263
    }
257
    dsp->apply_window_int16(output, input, window, len);
264 258
}
265 259

  
266 260

  
libavcodec/ac3enc_float.c
83 83
 * Apply KBD window to input samples prior to MDCT.
84 84
 */
85 85
static void apply_window(DSPContext *dsp, float *output, const float *input,
86
                         const float *window, int n)
86
                         const float *window, unsigned int len)
87 87
{
88
    dsp->vector_fmul(output, input, window, n);
88
    dsp->vector_fmul(output, input, window, len);
89 89
}
90 90

  
91 91

  
libavcodec/ac3tab.c
141 141
/* AC-3 MDCT window */
142 142

  
143 143
/* MDCT window */
144
const int16_t ff_ac3_window[AC3_WINDOW_SIZE/2] = {
144
DECLARE_ALIGNED(16, const int16_t, ff_ac3_window)[AC3_WINDOW_SIZE/2] = {
145 145
    4,    7,   12,   16,   21,   28,   34,   42,
146 146
   51,   61,   72,   84,   97,  111,  127,  145,
147 147
  164,  184,  207,  231,  257,  285,  315,  347,
libavcodec/dsputil.c
3890 3890
    return res;
3891 3891
}
3892 3892

  
3893
static void apply_window_int16_c(int16_t *output, const int16_t *input,
3894
                                 const int16_t *window, unsigned int len)
3895
{
3896
    int i;
3897
    int len2 = len >> 1;
3898

  
3899
    for (i = 0; i < len2; i++) {
3900
        int16_t w       = window[i];
3901
        output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
3902
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
3903
    }
3904
}
3905

  
3893 3906
#define W0 2048
3894 3907
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3895 3908
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
......
4364 4377
    c->vector_clipf = vector_clipf_c;
4365 4378
    c->scalarproduct_int16 = scalarproduct_int16_c;
4366 4379
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4380
    c->apply_window_int16 = apply_window_int16_c;
4367 4381
    c->scalarproduct_float = scalarproduct_float_c;
4368 4382
    c->butterflies_float = butterflies_float_c;
4369 4383
    c->vector_fmul_scalar = vector_fmul_scalar_c;
libavcodec/dsputil.h
524 524
     */
525 525
    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, const int16_t *v2, const int16_t *v3, int len, int mul);
526 526

  
527
    /**
528
     * Apply symmetric window in 16-bit fixed-point.
529
     * @param output destination array
530
     *               constraints: 16-byte aligned
531
     * @param input  source array
532
     *               constraints: 16-byte aligned
533
     * @param window window array
534
     *               constraints: 16-byte aligned, at least len/2 elements
535
     * @param len    full window length
536
     *               constraints: multiple of 16, greater than zero
537
     */
538
    void (*apply_window_int16)(int16_t *output, const int16_t *input,
539
                               const int16_t *window, unsigned int len);
540

  
527 541
    /* rv30 functions */
528 542
    qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
529 543
    qpel_mc_func avg_rv30_tpel_pixels_tab[4][16];
libavcodec/x86/dsputil_mmx.c
2388 2388
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2389 2389
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2390 2390
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2391

  
2392
void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
2393
                                      const int16_t *window, unsigned int len);
2394
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2395
                                      const int16_t *window, unsigned int len);
2396
void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
2397
                                      const int16_t *window, unsigned int len);
2398
void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
2399
                                      const int16_t *window, unsigned int len);
2400
void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
2401
                                      const int16_t *window, unsigned int len);
2402
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2403
                                      const int16_t *window, unsigned int len);
2404

  
2391 2405
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
2392 2406
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2393 2407
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
......
2749 2763
#if HAVE_YASM
2750 2764
            c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2751 2765
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
2766
            if (avctx->flags & CODEC_FLAG_BITEXACT) {
2767
                c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2768
            } else {
2769
                c->apply_window_int16 = ff_apply_window_int16_mmxext;
2770
            }
2752 2771
#endif
2753 2772
        }
2754 2773
        if(mm_flags & AV_CPU_FLAG_SSE){
......
2771 2790
#if HAVE_YASM
2772 2791
            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2773 2792
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2793
            if (avctx->flags & CODEC_FLAG_BITEXACT) {
2794
                c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
2795
            } else {
2796
                if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2797
                    c->apply_window_int16 = ff_apply_window_int16_sse2;
2798
                }
2799
            }
2774 2800

  
2775 2801
            c->emulated_edge_mc = emulated_edge_mc_sse;
2776 2802
            c->gmc= gmc_sse;
2777 2803
#endif
2778 2804
        }
2779
        if((mm_flags & AV_CPU_FLAG_SSSE3) && !(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)) && HAVE_YASM) // cachesplit
2780
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2805
        if (mm_flags & AV_CPU_FLAG_SSSE3) {
2806
#if HAVE_YASM
2807
            if (mm_flags & AV_CPU_FLAG_ATOM) {
2808
                c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2809
            } else {
2810
                c->apply_window_int16 = ff_apply_window_int16_ssse3;
2811
            }
2812
            if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
2813
                c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2814
            }
2815
#endif
2816
        }
2781 2817
    }
2782 2818

  
2783 2819
    if (CONFIG_ENCODERS)
libavcodec/x86/dsputil_yasm.asm
27 27
pb_7: times 8 db 7
28 28
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
29 29
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
30
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
31
pd_16384: times 4 dd 16384
30 32

  
31 33
section .text align=16
32 34

  
......
202 204
    RET
203 205

  
204 206

  
207
;-----------------------------------------------------------------------------
208
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
209
;                            const int16_t *window, unsigned int len)
210
;-----------------------------------------------------------------------------
211

  
212
%macro REVERSE_WORDS_MMXEXT 1-2
213
    pshufw   %1, %1, 0x1B
214
%endmacro
215

  
216
%macro REVERSE_WORDS_SSE2 1-2
217
    pshuflw  %1, %1, 0x1B
218
    pshufhw  %1, %1, 0x1B
219
    pshufd   %1, %1, 0x4E
220
%endmacro
221

  
222
%macro REVERSE_WORDS_SSSE3 2
223
    pshufb  %1, %2
224
%endmacro
225

  
226
; dst = (dst * src) >> 15
227
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
228
; in from the pmullw result.
229
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
230
    mova    %3, %1
231
    pmulhw  %1, %2
232
    pmullw  %3, %2
233
    psrlw   %3, 15
234
    psllw   %1, 1
235
    por     %1, %3
236
%endmacro
237

  
238
; dst = ((dst * src) + (1<<14)) >> 15
239
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
240
    pmulhrsw   %1, %2
241
%endmacro
242

  
243
%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
244
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
245
    lea     offset2q, [offsetq-mmsize]
246
%if %2
247
    mova          m5, [pd_16384]
248
%elifidn %1, ssse3
249
    mova          m5, [pb_revwords]
250
    ALIGN 16
251
%endif
252
.loop:
253
%if %2
254
    ; This version expands 16-bit to 32-bit, multiplies by the window,
255
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
256
    ; save to the output. The window is reversed for the second half.
257
    mova          m3, [windowq+offset2q]
258
    mova          m4, [ inputq+offset2q]
259
    pxor          m0, m0
260
    punpcklwd     m0, m3
261
    punpcklwd     m1, m4
262
    pmaddwd       m0, m1
263
    paddd         m0, m5
264
    psrad         m0, 15
265
    pxor          m2, m2
266
    punpckhwd     m2, m3
267
    punpckhwd     m1, m4
268
    pmaddwd       m2, m1
269
    paddd         m2, m5
270
    psrad         m2, 15
271
    packssdw      m0, m2
272
    mova  [outputq+offset2q], m0
273
    REVERSE_WORDS m3
274
    mova          m4, [ inputq+offsetq]
275
    pxor          m0, m0
276
    punpcklwd     m0, m3
277
    punpcklwd     m1, m4
278
    pmaddwd       m0, m1
279
    paddd         m0, m5
280
    psrad         m0, 15
281
    pxor          m2, m2
282
    punpckhwd     m2, m3
283
    punpckhwd     m1, m4
284
    pmaddwd       m2, m1
285
    paddd         m2, m5
286
    psrad         m2, 15
287
    packssdw      m0, m2
288
    mova  [outputq+offsetq], m0
289
%elif %3
290
    ; This version does the 16x16->16 multiplication in-place without expanding
291
    ; to 32-bit. The ssse3 version is bit-identical to the C version.
292
    mova          m0, [windowq+offset2q]
293
    mova          m1, [ inputq+offset2q]
294
    pmulhrsw      m1, m0
295
    REVERSE_WORDS m0, m5
296
    pmulhrsw      m0, [ inputq+offsetq ]
297
    mova  [outputq+offset2q], m1
298
    mova  [outputq+offsetq ], m0
299
%else
300
    ; This version does the 16x16->16 multiplication in-place without expanding
301
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
302
    ; therefore are not bit-identical to the C version.
303
    mova          m0, [windowq+offset2q]
304
    mova          m1, [ inputq+offset2q]
305
    mova          m2, [ inputq+offsetq ]
306
    MUL16FIXED    m1, m0, m3
307
    REVERSE_WORDS m0
308
    MUL16FIXED    m2, m0, m3
309
    mova  [outputq+offset2q], m1
310
    mova  [outputq+offsetq ], m2
311
%endif
312
    add      offsetd, mmsize
313
    sub     offset2d, mmsize
314
    jae .loop
315
    REP_RET
316
%endmacro
317

  
318
INIT_MMX
319
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
320
%define MUL16FIXED MUL16FIXED_MMXEXT
321
APPLY_WINDOW_INT16 mmxext,     0, 0
322
APPLY_WINDOW_INT16 mmxext_ba,  1, 0
323
INIT_XMM
324
%define REVERSE_WORDS REVERSE_WORDS_SSE2
325
APPLY_WINDOW_INT16 sse2,       0, 0
326
APPLY_WINDOW_INT16 sse2_ba,    1, 0
327
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
328
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
329
APPLY_WINDOW_INT16 ssse3,      0, 1
330

  
205 331

  
206 332
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
207 333
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
tests/ref/acodec/ac3_fixed
1
b3a8f0a8809a58b2ece90744f06fff96 *./tests/data/acodec/ac3.rm
1
346073c97eada69330f61e103a170ca1 *./tests/data/acodec/ac3.rm
2 2
98751 ./tests/data/acodec/ac3.rm
tests/ref/lavf/rm
1
7da378131db880bcf2e58305d54418ec *./tests/data/lavf/lavf.rm
1
7b7ede9548a09346675edad36acfbf19 *./tests/data/lavf/lavf.rm
2 2
346706 ./tests/data/lavf/lavf.rm
tests/ref/seek/ac3_rm
1 1
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
2 2
ret: 0         st:-1 flags:0  ts:-1.000000
3 3
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
4
ret: 0         st:-1 flags:1  ts: 1.894167
5
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
6
ret: 0         st: 0 flags:0  ts: 0.788000
7
ret: 0         st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos:   5822 size:   916
4
ret:-1         st:-1 flags:1  ts: 1.894167
5
ret:-1         st: 0 flags:0  ts: 0.788000
8 6
ret: 0         st: 0 flags:1  ts:-0.317000
9 7
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
10
ret: 0         st:-1 flags:0  ts: 2.576668
11
ret: 0         st: 0 flags:1 dts:524.800000 pts:524.800000 pos:   6155 size:   244
8
ret:-1         st:-1 flags:0  ts: 2.576668
12 9
ret:-1         st:-1 flags:1  ts: 1.470835
13
ret: 0         st: 0 flags:0  ts: 0.365000
14
ret: 0         st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos:   5822 size:   916
10
ret:-1         st: 0 flags:0  ts: 0.365000
15 11
ret: 0         st: 0 flags:1  ts:-0.741000
16 12
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
17 13
ret:-1         st:-1 flags:0  ts: 2.153336
18
ret: 0         st:-1 flags:1  ts: 1.047503
19
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
14
ret:-1         st:-1 flags:1  ts: 1.047503
20 15
ret: 0         st: 0 flags:0  ts:-0.058000
21 16
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
22
ret: 0         st: 0 flags:1  ts: 2.836000
23
ret: 0         st: 0 flags:1 dts: 2.681000 pts: 2.681000 pos:  44105 size:   558
17
ret:-1         st: 0 flags:1  ts: 2.836000
24 18
ret:-1         st:-1 flags:0  ts: 1.730004
25
ret: 0         st:-1 flags:1  ts: 0.624171
26
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
19
ret:-1         st:-1 flags:1  ts: 0.624171
27 20
ret: 0         st: 0 flags:0  ts:-0.482000
28 21
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
29 22
ret:-1         st: 0 flags:1  ts: 2.413000
30 23
ret:-1         st:-1 flags:0  ts: 1.306672
31
ret: 0         st:-1 flags:1  ts: 0.200839
32
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
24
ret:-1         st:-1 flags:1  ts: 0.200839
33 25
ret: 0         st: 0 flags:0  ts:-0.905000
34 26
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
35 27
ret:-1         st: 0 flags:1  ts: 1.989000
36
ret: 0         st:-1 flags:0  ts: 0.883340
37
ret: 0         st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos:   5822 size:   916
28
ret:-1         st:-1 flags:0  ts: 0.883340
38 29
ret: 0         st:-1 flags:1  ts:-0.222493
39 30
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556
40 31
ret:-1         st: 0 flags:0  ts: 2.672000
41 32
ret:-1         st: 0 flags:1  ts: 1.566000
42
ret: 0         st:-1 flags:0  ts: 0.460008
43
ret: 0         st: 0 flags:1 dts:12581.487000 pts:12581.487000 pos:   5822 size:   916
33
ret:-1         st:-1 flags:0  ts: 0.460008
44 34
ret: 0         st:-1 flags:1  ts:-0.645825
45 35
ret: 0         st: 0 flags:1 dts: 0.000000 pts: 0.000000 pos:    271 size:   556

Also available in: Unified diff