Revision 80ba1ddb

View differences:

libavcodec/aacdec.c
1721 1721
     */
1722 1722
    if ((ics->window_sequence[1] == ONLY_LONG_SEQUENCE || ics->window_sequence[1] == LONG_STOP_SEQUENCE) &&
1723 1723
            (ics->window_sequence[0] == ONLY_LONG_SEQUENCE || ics->window_sequence[0] == LONG_START_SEQUENCE)) {
1724
        ac->dsp.vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 0, 512);
1724
        ac->dsp.vector_fmul_window(    out,               saved,            buf,         lwindow_prev, 512);
1725 1725
    } else {
1726 1726
        memcpy(                        out,               saved,            448 * sizeof(float));
1727 1727

  
1728 1728
        if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
1729
            ac->dsp.vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 0, 64);
1730
            ac->dsp.vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      0, 64);
1731
            ac->dsp.vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      0, 64);
1732
            ac->dsp.vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      0, 64);
1733
            ac->dsp.vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      0, 64);
1729
            ac->dsp.vector_fmul_window(out + 448 + 0*128, saved + 448,      buf + 0*128, swindow_prev, 64);
1730
            ac->dsp.vector_fmul_window(out + 448 + 1*128, buf + 0*128 + 64, buf + 1*128, swindow,      64);
1731
            ac->dsp.vector_fmul_window(out + 448 + 2*128, buf + 1*128 + 64, buf + 2*128, swindow,      64);
1732
            ac->dsp.vector_fmul_window(out + 448 + 3*128, buf + 2*128 + 64, buf + 3*128, swindow,      64);
1733
            ac->dsp.vector_fmul_window(temp,              buf + 3*128 + 64, buf + 4*128, swindow,      64);
1734 1734
            memcpy(                    out + 448 + 4*128, temp, 64 * sizeof(float));
1735 1735
        } else {
1736
            ac->dsp.vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 0, 64);
1736
            ac->dsp.vector_fmul_window(out + 448,         saved + 448,      buf,         swindow_prev, 64);
1737 1737
            memcpy(                    out + 576,         buf + 64,         448 * sizeof(float));
1738 1738
        }
1739 1739
    }
......
1741 1741
    // buffer update
1742 1742
    if (ics->window_sequence[0] == EIGHT_SHORT_SEQUENCE) {
1743 1743
        memcpy(                    saved,       temp + 64,         64 * sizeof(float));
1744
        ac->dsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 0, 64);
1745
        ac->dsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 0, 64);
1746
        ac->dsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 0, 64);
1744
        ac->dsp.vector_fmul_window(saved + 64,  buf + 4*128 + 64, buf + 5*128, swindow, 64);
1745
        ac->dsp.vector_fmul_window(saved + 192, buf + 5*128 + 64, buf + 6*128, swindow, 64);
1746
        ac->dsp.vector_fmul_window(saved + 320, buf + 6*128 + 64, buf + 7*128, swindow, 64);
1747 1747
        memcpy(                    saved + 448, buf + 7*128 + 64,  64 * sizeof(float));
1748 1748
    } else if (ics->window_sequence[0] == LONG_START_SEQUENCE) {
1749 1749
        memcpy(                    saved,       buf + 512,        448 * sizeof(float));
libavcodec/ac3dec.c
628 628
            for(i=0; i<128; i++)
629 629
                x[i] = s->transform_coeffs[ch][2*i];
630 630
            ff_imdct_half(&s->imdct_256, s->tmp_output, x);
631
            s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 0, 128);
631
            s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 128);
632 632
            for(i=0; i<128; i++)
633 633
                x[i] = s->transform_coeffs[ch][2*i+1];
634 634
            ff_imdct_half(&s->imdct_256, s->delay[ch-1], x);
635 635
        } else {
636 636
            ff_imdct_half(&s->imdct_512, s->tmp_output, s->transform_coeffs[ch]);
637
            s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 0, 128);
637
            s->dsp.vector_fmul_window(s->output[ch-1], s->delay[ch-1], s->tmp_output, s->window, 128);
638 638
            memcpy(s->delay[ch-1], s->tmp_output+128, 128*sizeof(float));
639 639
        }
640 640
    }
libavcodec/arm/dcadsp_init_arm.c
23 23
#include "libavcodec/dcadsp.h"
24 24

  
25 25
void ff_dca_lfe_fir_neon(float *out, const float *in, const float *coefs,
26
                         int decifactor, float scale, float bias);
26
                         int decifactor, float scale);
27 27

  
28 28
void av_cold ff_dcadsp_init_arm(DCADSPContext *s)
29 29
{
libavcodec/arm/dcadsp_neon.S
29 29
        cmp             r3,  #32
30 30
        moveq           r6,  #256/32
31 31
        movne           r6,  #256/64
32
NOVFP   vldr            d0,  [sp, #16]          @ scale, bias
32
NOVFP   vldr            s0,  [sp, #16]          @ scale
33 33
        mov             lr,  #-16
34 34
1:
35 35
        vmov.f32        q2,  #0.0               @ v0
......
51 51
        vadd.f32        d4,  d4,  d5
52 52
        vadd.f32        d6,  d6,  d7
53 53
        vpadd.f32       d4,  d4,  d6
54
        vdup.32         d5,  d0[1]
55
        vmla.f32        d5,  d4,  d0[0]
54
        vmul.f32        d5,  d4,  d0[0]
56 55
        vst1.32         {d5[0]},  [r0,:32]!
57 56
        vst1.32         {d5[1]},  [r4,:32]!
58 57
        bne             1b
libavcodec/arm/dsputil_init_neon.c
140 140

  
141 141
void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len);
142 142
void ff_vector_fmul_window_neon(float *dst, const float *src0,
143
                                const float *src1, const float *win,
144
                                float add_bias, int len);
143
                                const float *src1, const float *win, int len);
145 144
void ff_vector_fmul_scalar_neon(float *dst, const float *src, float mul,
146 145
                                int len);
147 146
void ff_vector_fmul_sv_scalar_2_neon(float *dst, const float *src,
libavcodec/arm/dsputil_neon.S
777 777
endfunc
778 778

  
779 779
function ff_vector_fmul_window_neon, export=1
780
VFP     vdup.32         q8,  d0[0]
781
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
782 780
        push            {r4,r5,lr}
783
VFP     ldr             lr,  [sp, #12]
784
NOVFP   ldr             lr,  [sp, #16]
781
        ldr             lr,  [sp, #12]
785 782
        sub             r2,  r2,  #8
786 783
        sub             r5,  lr,  #2
787 784
        add             r2,  r2,  r5, lsl #2
......
793 790
        vld1.64         {d4,d5},  [r3,:128]!
794 791
        vld1.64         {d6,d7},  [r4,:128], r5
795 792
1:      subs            lr,  lr,  #4
796
        vmov            q11, q8
797
        vmla.f32        d22, d0,  d4
798
        vmov            q10, q8
799
        vmla.f32        d23, d1,  d5
793
        vmul.f32        d22, d0,  d4
800 794
        vrev64.32       q3,  q3
801
        vmla.f32        d20, d0,  d7
795
        vmul.f32        d23, d1,  d5
802 796
        vrev64.32       q1,  q1
803
        vmla.f32        d21, d1,  d6
797
        vmul.f32        d20, d0,  d7
798
        vmul.f32        d21, d1,  d6
804 799
        beq             2f
805 800
        vmla.f32        d22, d3,  d7
806 801
        vld1.64         {d0,d1},  [r1,:128]!
libavcodec/arm/fft_init_arm.c
34 34
                                float *synth_buf_ptr, int *synth_buf_offset,
35 35
                                float synth_buf2[32], const float window[512],
36 36
                                float out[32], const float in[32],
37
                                float scale, float bias);
37
                                float scale);
38 38

  
39 39
av_cold void ff_fft_init_arm(FFTContext *s)
40 40
{
libavcodec/arm/synth_filter_neon.S
42 42

  
43 43
        ldr             r5,  [sp, #9*4]         @ window
44 44
        ldr             r2,  [sp, #10*4]        @ out
45
NOVFP   vldr            d0,  [sp, #12*4]        @ scale, bias
45
NOVFP   vldr            s0,  [sp, #12*4]        @ scale
46 46
        add             r8,  r9,  #12*4
47 47

  
48 48
        mov             lr,  #64*4
......
90 90
        sub             r11, r11, #512*4
91 91
        b               2b
92 92
3:
93
        vdup.32         q8,  d0[1]
94
        vdup.32         q9,  d0[1]
95
        vmla.f32        q8,  q10, d0[0]
96
        vmla.f32        q9,  q1,  d0[0]
93
        vmul.f32        q8,  q10, d0[0]
94
        vmul.f32        q9,  q1,  d0[0]
97 95
        vst1.32         {q3},     [r3,:128]
98 96
        sub             r3,  r3,  #16*4
99 97
        vst1.32         {q2},     [r3,:128]
libavcodec/atrac1.c
141 141

  
142 142
            /* overlap and window */
143 143
            q->dsp.vector_fmul_window(&q->bands[band_num][start_pos], prev_buf,
144
                                      &su->spectrum[0][ref_pos + start_pos], ff_sine_32, 0, 16);
144
                                      &su->spectrum[0][ref_pos + start_pos], ff_sine_32, 16);
145 145

  
146 146
            prev_buf = &su->spectrum[0][ref_pos+start_pos + 16];
147 147
            start_pos += block_size;
libavcodec/dca.c
896 896
        s->synth.synth_filter_float(&s->imdct,
897 897
                              s->subband_fir_hist[chans], &s->hist_index[chans],
898 898
                              s->subband_fir_noidea[chans], prCoeff,
899
                              samples_out, s->raXin, scale, 0);
899
                              samples_out, s->raXin, scale);
900 900
        samples_out+= 32;
901 901

  
902 902
    }
......
929 929
    /* Interpolation */
930 930
    for (deciindex = 0; deciindex < num_deci_sample; deciindex++) {
931 931
        s->dcadsp.lfe_fir(samples_out, samples_in, prCoeff, decifactor,
932
                          scale, 0);
932
                          scale);
933 933
        samples_in++;
934 934
        samples_out += 2 * decifactor;
935 935
    }
libavcodec/dcadsp.c
23 23
#include "dcadsp.h"
24 24

  
25 25
static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
26
                          int decifactor, float scale, float bias)
26
                          int decifactor, float scale)
27 27
{
28 28
    float *out2 = out + decifactor;
29 29
    const float *cf0 = coefs;
......
39 39
            v0 += s * *cf0++;
40 40
            v1 += s * *--cf1;
41 41
        }
42
        *out++  = (v0 * scale) + bias;
43
        *out2++ = (v1 * scale) + bias;
42
        *out++  = v0 * scale;
43
        *out2++ = v1 * scale;
44 44
    }
45 45
}
46 46

  
libavcodec/dcadsp.h
21 21

  
22 22
typedef struct DCADSPContext {
23 23
    void (*lfe_fir)(float *out, const float *in, const float *coefs,
24
                    int decifactor, float scale, float bias);
24
                    int decifactor, float scale);
25 25
} DCADSPContext;
26 26

  
27 27
void ff_dcadsp_init(DCADSPContext *s);
libavcodec/dsputil.c
3776 3776
        dst[i] = src0[i] * src1[i] + src2[i];
3777 3777
}
3778 3778

  
3779
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3779
static void vector_fmul_window_c(float *dst, const float *src0,
3780
                                 const float *src1, const float *win, int len)
3781
{
3780 3782
    int i,j;
3781 3783
    dst += len;
3782 3784
    win += len;
......
3786 3788
        float s1 = src1[j];
3787 3789
        float wi = win[i];
3788 3790
        float wj = win[j];
3789
        dst[i] = s0*wj - s1*wi + add_bias;
3790
        dst[j] = s0*wi + s1*wj + add_bias;
3791
        dst[i] = s0*wj - s1*wi;
3792
        dst[j] = s0*wi + s1*wj;
3791 3793
    }
3792 3794
}
3793 3795

  
......
4434 4436
    c->vector_fmul = vector_fmul_c;
4435 4437
    c->vector_fmul_reverse = vector_fmul_reverse_c;
4436 4438
    c->vector_fmul_add = vector_fmul_add_c;
4437
    c->vector_fmul_window = ff_vector_fmul_window_c;
4439
    c->vector_fmul_window = vector_fmul_window_c;
4438 4440
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4439 4441
    c->vector_clipf = vector_clipf_c;
4440 4442
    c->float_to_int16 = ff_float_to_int16_c;
libavcodec/dsputil.h
68 68
void ff_svq3_luma_dc_dequant_idct_c(DCTELEM *output, DCTELEM *input, int qp);
69 69
void ff_svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
70 70

  
71
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
72
                             const float *win, float add_bias, int len);
73

  
74 71
/* encoding scans */
75 72
extern const uint8_t ff_alternate_horizontal_scan[64];
76 73
extern const uint8_t ff_alternate_vertical_scan[64];
......
393 390
    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
394 391
    void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
395 392
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
396
    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
393
    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, int len);
397 394
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
398 395
    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
399 396
    void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
libavcodec/ppc/float_altivec.c
90 90
    }
91 91
}
92 92

  
93
static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len)
93
static void vector_fmul_window_altivec(float *dst, const float *src0, const float *src1, const float *win, int len)
94 94
{
95
    union {
96
        vector float v;
97
        float s[4];
98
    } vadd;
99
    vector float vadd_bias, zero, t0, t1, s0, s1, wi, wj;
95
    vector float zero, t0, t1, s0, s1, wi, wj;
100 96
    const vector unsigned char reverse = vcprm(3,2,1,0);
101 97
    int i,j;
102 98

  
......
104 100
    win += len;
105 101
    src0+= len;
106 102

  
107
    vadd.s[0] = add_bias;
108
    vadd_bias = vec_splat(vadd.v, 0);
109 103
    zero = (vector float)vec_splat_u32(0);
110 104

  
111 105
    for(i=-len*4, j=len*4-16; i<0; i+=16, j-=16) {
......
117 111
        s1 = vec_perm(s1, s1, reverse);
118 112
        wj = vec_perm(wj, wj, reverse);
119 113

  
120
        t0 = vec_madd(s0, wj, vadd_bias);
114
        t0 = vec_madd(s0, wj, zero);
121 115
        t0 = vec_nmsub(s1, wi, t0);
122
        t1 = vec_madd(s0, wi, vadd_bias);
116
        t1 = vec_madd(s0, wi, zero);
123 117
        t1 = vec_madd(s1, wj, t1);
124 118
        t1 = vec_perm(t1, t1, reverse);
125 119

  
libavcodec/synth_filter.c
24 24
static void synth_filter_float(FFTContext *imdct,
25 25
                           float *synth_buf_ptr, int *synth_buf_offset,
26 26
                           float synth_buf2[32], const float window[512],
27
                           float out[32], const float in[32], float scale, float bias)
27
                           float out[32], const float in[32], float scale)
28 28
{
29 29
    float *synth_buf= synth_buf_ptr + *synth_buf_offset;
30 30
    int i, j;
......
48 48
            c += window[i + j + 32]*( synth_buf[16 + i + j - 512]);
49 49
            d += window[i + j + 48]*( synth_buf[31 - i + j - 512]);
50 50
        }
51
        out[i     ] = a*scale + bias;
52
        out[i + 16] = b*scale + bias;
51
        out[i     ] = a*scale;
52
        out[i + 16] = b*scale;
53 53
        synth_buf2[i     ] = c;
54 54
        synth_buf2[i + 16] = d;
55 55
    }
libavcodec/synth_filter.h
28 28
                               float *synth_buf_ptr, int *synth_buf_offset,
29 29
                               float synth_buf2[32], const float window[512],
30 30
                               float out[32], const float in[32],
31
                               float scale, float bias);
31
                               float scale);
32 32
} SynthFilterContext;
33 33

  
34 34
void ff_synth_filter_init(SynthFilterContext *c);
libavcodec/twinvq.c
646 646
                                     prev_buf + (bsize-wsize)/2,
647 647
                                     buf1 + bsize*j,
648 648
                                     ff_sine_windows[av_log2(wsize)],
649
                                     0.0,
650 649
                                     wsize/2);
651 650
        out2 += wsize;
652 651

  
libavcodec/vorbis_dec.c
1575 1575
        const float *win  = vc->win[blockflag & previous_window];
1576 1576

  
1577 1577
        if (blockflag == previous_window) {
1578
            vc->dsp.vector_fmul_window(ret, saved, buf, win, 0, blocksize / 4);
1578
            vc->dsp.vector_fmul_window(ret, saved, buf, win, blocksize / 4);
1579 1579
        } else if (blockflag > previous_window) {
1580
            vc->dsp.vector_fmul_window(ret, saved, buf, win, 0, bs0 / 4);
1580
            vc->dsp.vector_fmul_window(ret, saved, buf, win, bs0 / 4);
1581 1581
            memcpy(ret+bs0/2, buf+bs0/4, ((bs1-bs0)/4) * sizeof(float));
1582 1582
        } else {
1583 1583
            memcpy(ret, saved, ((bs1 - bs0) / 4) * sizeof(float));
1584
            vc->dsp.vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, 0, bs0 / 4);
1584
            vc->dsp.vector_fmul_window(ret + (bs1 - bs0) / 4, saved + (bs1 - bs0) / 4, buf, win, bs0 / 4);
1585 1585
        }
1586 1586
        memcpy(saved, buf + blocksize / 4, blocksize / 4 * sizeof(float));
1587 1587
    }
libavcodec/wmaprodec.c
1031 1031
        winlen >>= 1;
1032 1032

  
1033 1033
        s->dsp.vector_fmul_window(start, start, start + winlen,
1034
                                  window, 0, winlen);
1034
                                  window, winlen);
1035 1035

  
1036 1036
        s->channel[c].prev_block_len = s->subframe_len;
1037 1037
    }
libavcodec/x86/dsputil_mmx.c
2190 2190
    );
2191 2191
}
2192 2192

  
2193
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2194
                                      const float *win, float add_bias, int len){
2195 2193
#if HAVE_6REGS
2196
    if(add_bias == 0){
2194
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2195
                                      const float *win, int len){
2197 2196
        x86_reg i = -len*4;
2198 2197
        x86_reg j = len*4-8;
2199 2198
        __asm__ volatile(
......
2220 2219
            :"+r"(i), "+r"(j)
2221 2220
            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2222 2221
        );
2223
    }else
2224
#endif
2225
        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
2226 2222
}
2227 2223

  
2228 2224
static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2229
                                   const float *win, float add_bias, int len){
2230
#if HAVE_6REGS
2231
    if(add_bias == 0){
2225
                                   const float *win, int len){
2232 2226
        x86_reg i = -len*4;
2233 2227
        x86_reg j = len*4-16;
2234 2228
        __asm__ volatile(
......
2256 2250
            :"+r"(i), "+r"(j)
2257 2251
            :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2258 2252
        );
2259
    }else
2260
#endif
2261
        ff_vector_fmul_window_c(dst, src0, src1, win, add_bias, len);
2262 2253
}
2254
#endif /* HAVE_6REGS */
2263 2255

  
2264 2256
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
2265 2257
{
......
2882 2874
        }
2883 2875
        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
2884 2876
            c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2877
#if HAVE_6REGS
2885 2878
            c->vector_fmul_window = vector_fmul_window_3dnow2;
2879
#endif
2886 2880
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2887 2881
                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
2888 2882
            }
......
2899 2893
            c->vector_fmul = vector_fmul_sse;
2900 2894
            c->vector_fmul_reverse = vector_fmul_reverse_sse;
2901 2895
            c->vector_fmul_add = vector_fmul_add_sse;
2896
#if HAVE_6REGS
2902 2897
            c->vector_fmul_window = vector_fmul_window_sse;
2898
#endif
2903 2899
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
2904 2900
            c->vector_clipf = vector_clipf_sse;
2905 2901
            c->float_to_int16 = float_to_int16_sse;

Also available in: Unified diff