Revision 015f9f1a
libavcodec/aacenc.c | ||
---|---|---|
256 | 256 |
s->output[i - 448 - k] = (i < 1024) |
257 | 257 |
? sce->saved[i] |
258 | 258 |
: audio[(i-1024)*chans]; |
259 |
s->dsp.vector_fmul (s->output, k ? swindow : pwindow, 128); |
|
259 |
s->dsp.vector_fmul (s->output, s->output, k ? swindow : pwindow, 128);
|
|
260 | 260 |
s->dsp.vector_fmul_reverse(s->output+128, s->output+128, swindow, 128); |
261 | 261 |
ff_mdct_calc(&s->mdct128, sce->coeffs + k, s->output); |
262 | 262 |
} |
libavcodec/arm/dsputil_init_neon.c | ||
---|---|---|
138 | 138 |
void ff_vp3_v_loop_filter_neon(uint8_t *, int, int *); |
139 | 139 |
void ff_vp3_h_loop_filter_neon(uint8_t *, int, int *); |
140 | 140 |
|
141 |
void ff_vector_fmul_neon(float *dst, const float *src, int len); |
|
141 |
void ff_vector_fmul_neon(float *dst, const float *src0, const float *src1, int len);
|
|
142 | 142 |
void ff_vector_fmul_window_neon(float *dst, const float *src0, |
143 | 143 |
const float *src1, const float *win, |
144 | 144 |
float add_bias, int len); |
libavcodec/arm/dsputil_init_vfp.c | ||
---|---|---|
21 | 21 |
#include "libavcodec/dsputil.h" |
22 | 22 |
#include "dsputil_arm.h" |
23 | 23 |
|
24 |
void ff_vector_fmul_vfp(float *dst, const float *src, int len); |
|
24 |
void ff_vector_fmul_vfp(float *dst, const float *src0, |
|
25 |
const float *src1, int len); |
|
25 | 26 |
void ff_vector_fmul_reverse_vfp(float *dst, const float *src0, |
26 | 27 |
const float *src1, int len); |
27 | 28 |
void ff_float_to_int16_vfp(int16_t *dst, const float *src, long len); |
libavcodec/arm/dsputil_neon.S | ||
---|---|---|
738 | 738 |
endfunc |
739 | 739 |
|
740 | 740 |
function ff_vector_fmul_neon, export=1 |
741 |
mov r3, r0 |
|
742 |
subs r2, r2, #8 |
|
743 |
vld1.64 {d0-d3}, [r0,:128]! |
|
744 |
vld1.64 {d4-d7}, [r1,:128]! |
|
741 |
subs r3, r3, #8 |
|
742 |
vld1.64 {d0-d3}, [r1,:128]! |
|
743 |
vld1.64 {d4-d7}, [r2,:128]! |
|
745 | 744 |
vmul.f32 q8, q0, q2 |
746 | 745 |
vmul.f32 q9, q1, q3 |
747 | 746 |
beq 3f |
748 |
bics ip, r2, #15
|
|
747 |
bics ip, r3, #15
|
|
749 | 748 |
beq 2f |
750 | 749 |
1: subs ip, ip, #16 |
751 |
vld1.64 {d0-d1}, [r0,:128]!
|
|
752 |
vld1.64 {d4-d5}, [r1,:128]!
|
|
750 |
vld1.64 {d0-d1}, [r1,:128]!
|
|
751 |
vld1.64 {d4-d5}, [r2,:128]!
|
|
753 | 752 |
vmul.f32 q10, q0, q2 |
754 |
vld1.64 {d2-d3}, [r0,:128]!
|
|
755 |
vld1.64 {d6-d7}, [r1,:128]!
|
|
753 |
vld1.64 {d2-d3}, [r1,:128]!
|
|
754 |
vld1.64 {d6-d7}, [r2,:128]!
|
|
756 | 755 |
vmul.f32 q11, q1, q3 |
757 |
vst1.64 {d16-d19},[r3,:128]!
|
|
758 |
vld1.64 {d0-d1}, [r0,:128]!
|
|
759 |
vld1.64 {d4-d5}, [r1,:128]!
|
|
756 |
vst1.64 {d16-d19},[r0,:128]!
|
|
757 |
vld1.64 {d0-d1}, [r1,:128]!
|
|
758 |
vld1.64 {d4-d5}, [r2,:128]!
|
|
760 | 759 |
vmul.f32 q8, q0, q2 |
761 |
vld1.64 {d2-d3}, [r0,:128]!
|
|
762 |
vld1.64 {d6-d7}, [r1,:128]!
|
|
760 |
vld1.64 {d2-d3}, [r1,:128]!
|
|
761 |
vld1.64 {d6-d7}, [r2,:128]!
|
|
763 | 762 |
vmul.f32 q9, q1, q3 |
764 |
vst1.64 {d20-d23},[r3,:128]!
|
|
763 |
vst1.64 {d20-d23},[r0,:128]!
|
|
765 | 764 |
bne 1b |
766 |
ands r2, r2, #15
|
|
765 |
ands r3, r3, #15
|
|
767 | 766 |
beq 3f |
768 |
2: vld1.64 {d0-d1}, [r0,:128]!
|
|
769 |
vld1.64 {d4-d5}, [r1,:128]!
|
|
770 |
vst1.64 {d16-d17},[r3,:128]!
|
|
767 |
2: vld1.64 {d0-d1}, [r1,:128]!
|
|
768 |
vld1.64 {d4-d5}, [r2,:128]!
|
|
769 |
vst1.64 {d16-d17},[r0,:128]!
|
|
771 | 770 |
vmul.f32 q8, q0, q2 |
772 |
vld1.64 {d2-d3}, [r0,:128]!
|
|
773 |
vld1.64 {d6-d7}, [r1,:128]!
|
|
774 |
vst1.64 {d18-d19},[r3,:128]!
|
|
771 |
vld1.64 {d2-d3}, [r1,:128]!
|
|
772 |
vld1.64 {d6-d7}, [r2,:128]!
|
|
773 |
vst1.64 {d18-d19},[r0,:128]!
|
|
775 | 774 |
vmul.f32 q9, q1, q3 |
776 |
3: vst1.64 {d16-d19},[r3,:128]!
|
|
775 |
3: vst1.64 {d16-d19},[r0,:128]!
|
|
777 | 776 |
bx lr |
778 | 777 |
endfunc |
779 | 778 |
|
libavcodec/arm/dsputil_vfp.S | ||
---|---|---|
41 | 41 |
* ARM VFP optimized implementation of 'vector_fmul_c' function. |
42 | 42 |
* Assume that len is a positive number and is multiple of 8 |
43 | 43 |
*/ |
44 |
@ void ff_vector_fmul_vfp(float *dst, const float *src, int len) |
|
44 |
@ void ff_vector_fmul_vfp(float *dst, const float *src0, const float *src1, int len)
|
|
45 | 45 |
function ff_vector_fmul_vfp, export=1 |
46 | 46 |
vpush {d8-d15} |
47 |
mov r3, r0 |
|
48 | 47 |
fmrx r12, fpscr |
49 | 48 |
orr r12, r12, #(3 << 16) /* set vector size to 4 */ |
50 | 49 |
fmxr fpscr, r12 |
51 | 50 |
|
52 |
vldmia r3!, {s0-s3}
|
|
53 |
vldmia r1!, {s8-s11}
|
|
54 |
vldmia r3!, {s4-s7}
|
|
55 |
vldmia r1!, {s12-s15}
|
|
51 |
vldmia r1!, {s0-s3}
|
|
52 |
vldmia r2!, {s8-s11}
|
|
53 |
vldmia r1!, {s4-s7}
|
|
54 |
vldmia r2!, {s12-s15}
|
|
56 | 55 |
vmul.f32 s8, s0, s8 |
57 | 56 |
1: |
58 |
subs r2, r2, #16
|
|
57 |
subs r3, r3, #16
|
|
59 | 58 |
vmul.f32 s12, s4, s12 |
60 |
vldmiage r3!, {s16-s19}
|
|
61 |
vldmiage r1!, {s24-s27}
|
|
62 |
vldmiage r3!, {s20-s23}
|
|
63 |
vldmiage r1!, {s28-s31}
|
|
59 |
vldmiage r1!, {s16-s19}
|
|
60 |
vldmiage r2!, {s24-s27}
|
|
61 |
vldmiage r1!, {s20-s23}
|
|
62 |
vldmiage r2!, {s28-s31}
|
|
64 | 63 |
vmulge.f32 s24, s16, s24 |
65 | 64 |
vstmia r0!, {s8-s11} |
66 | 65 |
vstmia r0!, {s12-s15} |
67 | 66 |
vmulge.f32 s28, s20, s28 |
68 |
vldmiagt r3!, {s0-s3}
|
|
69 |
vldmiagt r1!, {s8-s11}
|
|
70 |
vldmiagt r3!, {s4-s7}
|
|
71 |
vldmiagt r1!, {s12-s15}
|
|
67 |
vldmiagt r1!, {s0-s3}
|
|
68 |
vldmiagt r2!, {s8-s11}
|
|
69 |
vldmiagt r1!, {s4-s7}
|
|
70 |
vldmiagt r2!, {s12-s15}
|
|
72 | 71 |
vmulge.f32 s8, s0, s8 |
73 | 72 |
vstmiage r0!, {s24-s27} |
74 | 73 |
vstmiage r0!, {s28-s31} |
libavcodec/atrac3.c | ||
---|---|---|
159 | 159 |
ff_imdct_calc(&q->mdct_ctx,pOutput,pInput); |
160 | 160 |
|
161 | 161 |
/* Perform windowing on the output. */ |
162 |
dsp.vector_fmul(pOutput,mdct_window,512);
|
|
162 |
dsp.vector_fmul(pOutput, pOutput, mdct_window, 512);
|
|
163 | 163 |
|
164 | 164 |
} |
165 | 165 |
|
libavcodec/dsputil.c | ||
---|---|---|
3750 | 3750 |
WRAPPER8_16_SQ(rd8x8_c, rd16_c) |
3751 | 3751 |
WRAPPER8_16_SQ(bit8x8_c, bit16_c) |
3752 | 3752 |
|
3753 |
static void vector_fmul_c(float *dst, const float *src, int len){ |
|
3753 |
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
|
|
3754 | 3754 |
int i; |
3755 | 3755 |
for(i=0; i<len; i++) |
3756 |
dst[i] *= src[i];
|
|
3756 |
dst[i] = src0[i] * src1[i];
|
|
3757 | 3757 |
} |
3758 | 3758 |
|
3759 | 3759 |
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){ |
libavcodec/dsputil.h | ||
---|---|---|
375 | 375 |
void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize); |
376 | 376 |
void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len); |
377 | 377 |
/* assume len is a multiple of 8, and arrays are 16-byte aligned */ |
378 |
void (*vector_fmul)(float *dst, const float *src, int len); |
|
378 |
void (*vector_fmul)(float *dst, const float *src0, const float *src1, int len);
|
|
379 | 379 |
void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len); |
380 | 380 |
/* assume len is a multiple of 8, and src arrays are 16-byte aligned */ |
381 | 381 |
void (*vector_fmul_add)(float *dst, const float *src0, const float *src1, const float *src2, int len); |
libavcodec/nellymoserenc.c | ||
---|---|---|
113 | 113 |
|
114 | 114 |
static void apply_mdct(NellyMoserEncodeContext *s) |
115 | 115 |
{ |
116 |
memcpy(s->in_buff, s->buf[s->bufsel], NELLY_BUF_LEN * sizeof(float)); |
|
117 |
s->dsp.vector_fmul(s->in_buff, ff_sine_128, NELLY_BUF_LEN); |
|
116 |
s->dsp.vector_fmul(s->in_buff, s->buf[s->bufsel], ff_sine_128, NELLY_BUF_LEN); |
|
118 | 117 |
s->dsp.vector_fmul_reverse(s->in_buff + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128, |
119 | 118 |
NELLY_BUF_LEN); |
120 | 119 |
ff_mdct_calc(&s->mdct_ctx, s->mdct_out, s->in_buff); |
121 | 120 |
|
122 |
s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, ff_sine_128, NELLY_BUF_LEN); |
|
121 |
s->dsp.vector_fmul(s->buf[s->bufsel] + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN, |
|
122 |
ff_sine_128, NELLY_BUF_LEN); |
|
123 | 123 |
s->dsp.vector_fmul_reverse(s->buf[s->bufsel] + 2 * NELLY_BUF_LEN, s->buf[1 - s->bufsel], ff_sine_128, |
124 | 124 |
NELLY_BUF_LEN); |
125 | 125 |
ff_mdct_calc(&s->mdct_ctx, s->mdct_out + NELLY_BUF_LEN, s->buf[s->bufsel] + NELLY_BUF_LEN); |
libavcodec/ppc/float_altivec.c | ||
---|---|---|
23 | 23 |
#include "dsputil_altivec.h" |
24 | 24 |
#include "util_altivec.h" |
25 | 25 |
|
26 |
static void vector_fmul_altivec(float *dst, const float *src, int len) |
|
26 |
static void vector_fmul_altivec(float *dst, const float *src0, const float *src1, int len)
|
|
27 | 27 |
{ |
28 | 28 |
int i; |
29 | 29 |
vector float d0, d1, s, zero = (vector float)vec_splat_u32(0); |
30 | 30 |
for(i=0; i<len-7; i+=8) { |
31 |
d0 = vec_ld(0, dst+i);
|
|
32 |
s = vec_ld(0, src+i); |
|
33 |
d1 = vec_ld(16, dst+i);
|
|
31 |
d0 = vec_ld(0, src0+i);
|
|
32 |
s = vec_ld(0, src1+i);
|
|
33 |
d1 = vec_ld(16, src0+i);
|
|
34 | 34 |
d0 = vec_madd(d0, s, zero); |
35 |
d1 = vec_madd(d1, vec_ld(16,src+i), zero); |
|
35 |
d1 = vec_madd(d1, vec_ld(16,src1+i), zero);
|
|
36 | 36 |
vec_st(d0, 0, dst+i); |
37 | 37 |
vec_st(d1, 16, dst+i); |
38 | 38 |
} |
libavcodec/twinvq.c | ||
---|---|---|
783 | 783 |
dec_bark_env(tctx, bark1[i][j], bark_use_hist[i][j], i, |
784 | 784 |
tctx->tmp_buf, gain[sub*i+j], ftype); |
785 | 785 |
|
786 |
tctx->dsp.vector_fmul(chunk + block_size*j, tctx->tmp_buf, |
|
786 |
tctx->dsp.vector_fmul(chunk + block_size*j, chunk + block_size*j, tctx->tmp_buf,
|
|
787 | 787 |
block_size); |
788 | 788 |
|
789 | 789 |
} |
... | ... | |
805 | 805 |
dec_lpc_spectrum_inv(tctx, lsp, ftype, tctx->tmp_buf); |
806 | 806 |
|
807 | 807 |
for (j = 0; j < mtab->fmode[ftype].sub; j++) { |
808 |
tctx->dsp.vector_fmul(chunk, tctx->tmp_buf, block_size); |
|
808 |
tctx->dsp.vector_fmul(chunk, chunk, tctx->tmp_buf, block_size);
|
|
809 | 809 |
chunk += block_size; |
810 | 810 |
} |
811 | 811 |
} |
libavcodec/vorbis_dec.c | ||
---|---|---|
1578 | 1578 |
for (j = vc->audio_channels-1;j >= 0; j--) { |
1579 | 1579 |
ch_floor_ptr = vc->channel_floors + j * blocksize / 2; |
1580 | 1580 |
ch_res_ptr = vc->channel_residues + res_chan[j] * blocksize / 2; |
1581 |
vc->dsp.vector_fmul(ch_floor_ptr, ch_res_ptr, blocksize / 2); |
|
1581 |
vc->dsp.vector_fmul(ch_floor_ptr, ch_floor_ptr, ch_res_ptr, blocksize / 2);
|
|
1582 | 1582 |
ff_imdct_half(&vc->mdct[blockflag], ch_res_ptr, ch_floor_ptr); |
1583 | 1583 |
} |
1584 | 1584 |
|
libavcodec/x86/dsputil_mmx.c | ||
---|---|---|
2074 | 2074 |
} |
2075 | 2075 |
} |
2076 | 2076 |
|
2077 |
static void vector_fmul_3dnow(float *dst, const float *src, int len){ |
|
2077 |
static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
|
|
2078 | 2078 |
x86_reg i = (len-4)*4; |
2079 | 2079 |
__asm__ volatile( |
2080 | 2080 |
"1: \n\t" |
2081 |
"movq (%1,%0), %%mm0 \n\t"
|
|
2082 |
"movq 8(%1,%0), %%mm1 \n\t"
|
|
2083 |
"pfmul (%2,%0), %%mm0 \n\t"
|
|
2084 |
"pfmul 8(%2,%0), %%mm1 \n\t"
|
|
2081 |
"movq (%2,%0), %%mm0 \n\t"
|
|
2082 |
"movq 8(%2,%0), %%mm1 \n\t"
|
|
2083 |
"pfmul (%3,%0), %%mm0 \n\t"
|
|
2084 |
"pfmul 8(%3,%0), %%mm1 \n\t"
|
|
2085 | 2085 |
"movq %%mm0, (%1,%0) \n\t" |
2086 | 2086 |
"movq %%mm1, 8(%1,%0) \n\t" |
2087 | 2087 |
"sub $16, %0 \n\t" |
2088 | 2088 |
"jge 1b \n\t" |
2089 | 2089 |
"femms \n\t" |
2090 | 2090 |
:"+r"(i) |
2091 |
:"r"(dst), "r"(src) |
|
2091 |
:"r"(dst), "r"(src0), "r"(src1)
|
|
2092 | 2092 |
:"memory" |
2093 | 2093 |
); |
2094 | 2094 |
} |
2095 |
static void vector_fmul_sse(float *dst, const float *src, int len){ |
|
2095 |
static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
|
|
2096 | 2096 |
x86_reg i = (len-8)*4; |
2097 | 2097 |
__asm__ volatile( |
2098 | 2098 |
"1: \n\t" |
2099 |
"movaps (%1,%0), %%xmm0 \n\t"
|
|
2100 |
"movaps 16(%1,%0), %%xmm1 \n\t"
|
|
2101 |
"mulps (%2,%0), %%xmm0 \n\t"
|
|
2102 |
"mulps 16(%2,%0), %%xmm1 \n\t"
|
|
2099 |
"movaps (%2,%0), %%xmm0 \n\t"
|
|
2100 |
"movaps 16(%2,%0), %%xmm1 \n\t"
|
|
2101 |
"mulps (%3,%0), %%xmm0 \n\t"
|
|
2102 |
"mulps 16(%3,%0), %%xmm1 \n\t"
|
|
2103 | 2103 |
"movaps %%xmm0, (%1,%0) \n\t" |
2104 | 2104 |
"movaps %%xmm1, 16(%1,%0) \n\t" |
2105 | 2105 |
"sub $32, %0 \n\t" |
2106 | 2106 |
"jge 1b \n\t" |
2107 | 2107 |
:"+r"(i) |
2108 |
:"r"(dst), "r"(src) |
|
2108 |
:"r"(dst), "r"(src0), "r"(src1)
|
|
2109 | 2109 |
:"memory" |
2110 | 2110 |
); |
2111 | 2111 |
} |
Also available in: Unified diff