Revision 015f9f1a

View differences:

libavcodec/aacenc.c
256 256
                s->output[i - 448 - k] = (i < 1024)
257 257
                                         ? sce->saved[i]
258 258
                                         : audio[(i-1024)*chans];
259
            s->dsp.vector_fmul        (s->output,     k ?  swindow : pwindow, 128);
259
            s->dsp.vector_fmul        (s->output,     s->output, k ?  swindow : pwindow, 128);
260 260
            s->dsp.vector_fmul_reverse(s->output+128, s->output+128, swindow, 128);
261 261
            ff_mdct_calc(&s->mdct128, sce->coeffs + k, s->output);
262 262
        }
libavcodec/arm/dsputil_init_neon.c
138 138
void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *);
139 139
void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *);
140 140

  
141
void ff_vector_fmul_neon(float *dst, const float *src, int len);
141
void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len);
142 142
void ff_vector_fmul_window_neon(float *dst, const float *src0,
143 143
                                const float *src1, const float *win,
144 144
                                float add_bias, int len);
libavcodec/arm/dsputil_init_vfp.c
21 21
#include "libavcodec/dsputil.h"
22 22
#include "dsputil_arm.h"
23 23

  
24
void ff_vector_fmul_vfp(float *dst, const float *src, int len);
24
void ff_vector_fmul_vfp(float *dst, const float *src0,
25
                        const float *src1, int len);
25 26
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0,
26 27
                                const float *src1, int len);
27 28
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len);
libavcodec/arm/dsputil_neon.S
738 738
endfunc
739 739

  
740 740
function ff_vector_fmul_neon, export=1
741
        mov             r3,  r0
742
        subs            r2,  r2,  #8
743
        vld1.64         {d0-d3},  [r0,:128]!
744
        vld1.64         {d4-d7},  [r1,:128]!
741
        subs            r3,  r3,  #8
742
        vld1.64         {d0-d3},  [r1,:128]!
743
        vld1.64         {d4-d7},  [r2,:128]!
745 744
        vmul.f32        q8,  q0,  q2
746 745
        vmul.f32        q9,  q1,  q3
747 746
        beq             3f
748
        bics            ip,  r2,  #15
747
        bics            ip,  r3,  #15
749 748
        beq             2f
750 749
1:      subs            ip,  ip,  #16
751
        vld1.64         {d0-d1},  [r0,:128]!
752
        vld1.64         {d4-d5},  [r1,:128]!
750
        vld1.64         {d0-d1},  [r1,:128]!
751
        vld1.64         {d4-d5},  [r2,:128]!
753 752
        vmul.f32        q10, q0,  q2
754
        vld1.64         {d2-d3},  [r0,:128]!
755
        vld1.64         {d6-d7},  [r1,:128]!
753
        vld1.64         {d2-d3},  [r1,:128]!
754
        vld1.64         {d6-d7},  [r2,:128]!
756 755
        vmul.f32        q11, q1,  q3
757
        vst1.64         {d16-d19},[r3,:128]!
758
        vld1.64         {d0-d1},  [r0,:128]!
759
        vld1.64         {d4-d5},  [r1,:128]!
756
        vst1.64         {d16-d19},[r0,:128]!
757
        vld1.64         {d0-d1},  [r1,:128]!
758
        vld1.64         {d4-d5},  [r2,:128]!
760 759
        vmul.f32        q8,  q0,  q2
761
        vld1.64         {d2-d3},  [r0,:128]!
762
        vld1.64         {d6-d7},  [r1,:128]!
760
        vld1.64         {d2-d3},  [r1,:128]!
761
        vld1.64         {d6-d7},  [r2,:128]!
763 762
        vmul.f32        q9,  q1,  q3
764
        vst1.64         {d20-d23},[r3,:128]!
763
        vst1.64         {d20-d23},[r0,:128]!
765 764
        bne             1b
766
        ands            r2,  r2,  #15
765
        ands            r3,  r3,  #15
767 766
        beq             3f
768
2:      vld1.64         {d0-d1},  [r0,:128]!
769
        vld1.64         {d4-d5},  [r1,:128]!
770
        vst1.64         {d16-d17},[r3,:128]!
767
2:      vld1.64         {d0-d1},  [r1,:128]!
768
        vld1.64         {d4-d5},  [r2,:128]!
769
        vst1.64         {d16-d17},[r0,:128]!
771 770
        vmul.f32        q8,  q0,  q2
772
        vld1.64         {d2-d3},  [r0,:128]!
773
        vld1.64         {d6-d7},  [r1,:128]!
774
        vst1.64         {d18-d19},[r3,:128]!
771
        vld1.64         {d2-d3},  [r1,:128]!
772
        vld1.64         {d6-d7},  [r2,:128]!
773
        vst1.64         {d18-d19},[r0,:128]!
775 774
        vmul.f32        q9,  q1,  q3
776
3:      vst1.64         {d16-d19},[r3,:128]!
775
3:      vst1.64         {d16-d19},[r0,:128]!
777 776
        bx              lr
778 777
endfunc
779 778

  
libavcodec/arm/dsputil_vfp.S
41 41
 * ARM VFP optimized implementation of 'vector_fmul_c' function.
42 42
 * Assume that len is a positive number and is a multiple of 8
43 43
 */
44
@ void ff_vector_fmul_vfp(float *dst, const float *src, int len)
44
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
45 45
function ff_vector_fmul_vfp, export=1
46 46
        vpush           {d8-d15}
47
        mov             r3,  r0
48 47
        fmrx            r12, fpscr
49 48
        orr             r12, r12, #(3 << 16) /* set vector size to 4 */
50 49
        fmxr            fpscr, r12
51 50

  
52
        vldmia          r3!, {s0-s3}
53
        vldmia          r1!, {s8-s11}
54
        vldmia          r3!, {s4-s7}
55
        vldmia          r1!, {s12-s15}
51
        vldmia          r1!, {s0-s3}
52
        vldmia          r2!, {s8-s11}
53
        vldmia          r1!, {s4-s7}
54
        vldmia          r2!, {s12-s15}
56 55
        vmul.f32        s8,  s0,  s8
57 56
1:
58
        subs            r2,  r2,  #16
57
        subs            r3,  r3,  #16
59 58
        vmul.f32        s12, s4,  s12
60
        vldmiage        r3!, {s16-s19}
61
        vldmiage        r1!, {s24-s27}
62
        vldmiage        r3!, {s20-s23}
63
        vldmiage        r1!, {s28-s31}
59
        vldmiage        r1!, {s16-s19}
60
        vldmiage        r2!, {s24-s27}
61
        vldmiage        r1!, {s20-s23}
62
        vldmiage        r2!, {s28-s31}
64 63
        vmulge.f32      s24, s16, s24
65 64
        vstmia          r0!, {s8-s11}
66 65
        vstmia          r0!, {s12-s15}
67 66
        vmulge.f32      s28, s20, s28
68
        vldmiagt        r3!, {s0-s3}
69
        vldmiagt        r1!, {s8-s11}
70
        vldmiagt        r3!, {s4-s7}
71
        vldmiagt        r1!, {s12-s15}
67
        vldmiagt        r1!, {s0-s3}
68
        vldmiagt        r2!, {s8-s11}
69
        vldmiagt        r1!, {s4-s7}
70
        vldmiagt        r2!, {s12-s15}
72 71
        vmulge.f32      s8,  s0,  s8
73 72
        vstmiage        r0!, {s24-s27}
74 73
        vstmiage        r0!, {s28-s31}
libavcodec/atrac3.c
159 159
    ff_imdct_calc(&q->mdct_ctx,pOutput,pInput);
160 160

  
161 161
    /* Perform windowing on the output. */
162
    dsp.vector_fmul(pOutput,mdct_window,512);
162
    dsp.vector_fmul(pOutput, pOutput, mdct_window, 512);
163 163

  
164 164
}
165 165

  
libavcodec/dsputil.c
3750 3750
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3751 3751
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3752 3752

  
3753
static void vector_fmul_c(float *dst, const float *src, int len){
3753
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
3754 3754
    int i;
3755 3755
    for(i=0; i<len; i++)
3756
        dst[i] *= src[i];
3756
        dst[i] = src0[i] * src1[i];
3757 3757
}
3758 3758

  
3759 3759
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
libavcodec/dsputil.h
375 375
    void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
376 376
    void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
377 377
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
378
    void (*vector_fmul)(float *dst, const float *src, int len);
378
    void (*vector_fmul)(float *dst, const float *src0, const float *src1, int len);
379 379
    void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
380 380
    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
381 381
    void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len);
libavcodec/nellymoserenc.c
113 113

  
114 114
static void apply_mdct(NellyMoserEncodeContext *s)
115 115
{
116
    memcpy(s->in_buff, s->buf[s->bufsel], NELLY_BUF_LEN * sizeof(float));
117
    s->dsp.vector_fmul(s->in_buff, ff_sine_128, NELLY_BUF_LEN);
116
    s->dsp.vector_fmul(s->in_buff, s->buf[s->bufsel], ff_sine_128, NELLY_BUF_LEN);
118 117
    s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128,
119 118
                               NELLY_BUF_LEN);
120 119
    ff_mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff);
121 120

  
122
    s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128, NELLY_BUF_LEN);
121
    s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN,
122
                       ff_sine_128, NELLY_BUF_LEN);
123 123
    s->dsp.vector_fmul_reverse(s->buf[s->bufsel] + 2 * NELLY_BUF_LEN, s->buf[1 - s->bufsel], ff_sine_128,
124 124
                               NELLY_BUF_LEN);
125 125
    ff_mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN);
libavcodec/ppc/float_altivec.c
23 23
#include "dsputil_altivec.h"
24 24
#include "util_altivec.h"
25 25

  
26
static void vector_fmul_altivec(float *dst, const float *src, int len)
26
static void vector_fmul_altivec(float *dst, const float *src0, const float *src1, int len)
27 27
{
28 28
    int i;
29 29
    vector float d0, d1, s, zero = (vector float)vec_splat_u32(0);
30 30
    for(i=0; i<len-7; i+=8) {
31
        d0 = vec_ld(0, dst+i);
32
        s = vec_ld(0, src+i);
33
        d1 = vec_ld(16, dst+i);
31
        d0 = vec_ld(0, src0+i);
32
        s = vec_ld(0, src1+i);
33
        d1 = vec_ld(16, src0+i);
34 34
        d0 = vec_madd(d0, s, zero);
35
        d1 = vec_madd(d1, vec_ld(16,src+i), zero);
35
        d1 = vec_madd(d1, vec_ld(16,src1+i), zero);
36 36
        vec_st(d0, 0, dst+i);
37 37
        vec_st(d1, 16, dst+i);
38 38
    }
libavcodec/twinvq.c
783 783
            dec_bark_env(tctx, bark1[i][j], bark_use_hist[i][j], i,
784 784
                         tctx->tmp_buf, gain[sub*i+j], ftype);
785 785

  
786
            tctx->dsp.vector_fmul(chunk + block_size*j, tctx->tmp_buf,
786
            tctx->dsp.vector_fmul(chunk + block_size*j, chunk + block_size*j, tctx->tmp_buf,
787 787
                                  block_size);
788 788

  
789 789
        }
......
805 805
        dec_lpc_spectrum_inv(tctx, lsp, ftype, tctx->tmp_buf);
806 806

  
807 807
        for (j = 0; j < mtab->fmode[ftype].sub; j++) {
808
            tctx->dsp.vector_fmul(chunk, tctx->tmp_buf, block_size);
808
            tctx->dsp.vector_fmul(chunk, chunk, tctx->tmp_buf, block_size);
809 809
            chunk += block_size;
810 810
        }
811 811
    }
libavcodec/vorbis_dec.c
1578 1578
    for (j = vc->audio_channels-1;j >= 0; j--) {
1579 1579
        ch_floor_ptr = vc->channel_floors   + j           * blocksize / 2;
1580 1580
        ch_res_ptr   = vc->channel_residues + res_chan[j] * blocksize / 2;
1581
        vc->dsp.vector_fmul(ch_floor_ptr, ch_res_ptr, blocksize / 2);
1581
        vc->dsp.vector_fmul(ch_floor_ptr, ch_floor_ptr, ch_res_ptr, blocksize / 2);
1582 1582
        ff_imdct_half(&vc->mdct[blockflag], ch_res_ptr, ch_floor_ptr);
1583 1583
    }
1584 1584

  
libavcodec/x86/dsputil_mmx.c
2074 2074
    }
2075 2075
}
2076 2076

  
2077
static void vector_fmul_3dnow(float *dst, const float *src, int len){
2077
static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
2078 2078
    x86_reg i = (len-4)*4;
2079 2079
    __asm__ volatile(
2080 2080
        "1: \n\t"
2081
        "movq    (%1,%0), %%mm0 \n\t"
2082
        "movq   8(%1,%0), %%mm1 \n\t"
2083
        "pfmul   (%2,%0), %%mm0 \n\t"
2084
        "pfmul  8(%2,%0), %%mm1 \n\t"
2081
        "movq    (%2,%0), %%mm0 \n\t"
2082
        "movq   8(%2,%0), %%mm1 \n\t"
2083
        "pfmul   (%3,%0), %%mm0 \n\t"
2084
        "pfmul  8(%3,%0), %%mm1 \n\t"
2085 2085
        "movq   %%mm0,  (%1,%0) \n\t"
2086 2086
        "movq   %%mm1, 8(%1,%0) \n\t"
2087 2087
        "sub  $16, %0 \n\t"
2088 2088
        "jge 1b \n\t"
2089 2089
        "femms  \n\t"
2090 2090
        :"+r"(i)
2091
        :"r"(dst), "r"(src)
2091
        :"r"(dst), "r"(src0), "r"(src1)
2092 2092
        :"memory"
2093 2093
    );
2094 2094
}
2095
static void vector_fmul_sse(float *dst, const float *src, int len){
2095
static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
2096 2096
    x86_reg i = (len-8)*4;
2097 2097
    __asm__ volatile(
2098 2098
        "1: \n\t"
2099
        "movaps    (%1,%0), %%xmm0 \n\t"
2100
        "movaps  16(%1,%0), %%xmm1 \n\t"
2101
        "mulps     (%2,%0), %%xmm0 \n\t"
2102
        "mulps   16(%2,%0), %%xmm1 \n\t"
2099
        "movaps    (%2,%0), %%xmm0 \n\t"
2100
        "movaps  16(%2,%0), %%xmm1 \n\t"
2101
        "mulps     (%3,%0), %%xmm0 \n\t"
2102
        "mulps   16(%3,%0), %%xmm1 \n\t"
2103 2103
        "movaps  %%xmm0,   (%1,%0) \n\t"
2104 2104
        "movaps  %%xmm1, 16(%1,%0) \n\t"
2105 2105
        "sub  $32, %0 \n\t"
2106 2106
        "jge 1b \n\t"
2107 2107
        :"+r"(i)
2108
        :"r"(dst), "r"(src)
2108
        :"r"(dst), "r"(src0), "r"(src1)
2109 2109
        :"memory"
2110 2110
    );
2111 2111
}

Also available in: Unified diff