@ ffmpeg / libavcodec / arm / mdct_neon.S
/*
 * ARM NEON optimised MDCT
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

    
22
#include "asm.S"

        @ Enable NEON encoding for the whole file; all code below lives in .text.
        .fpu neon
        .text
26

    
27
@ void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
@ In:  r0 = context, r1 = output, r2 = input.
@ The context is read at fixed offsets: mdct_bits at +28, revtab at +8,
@ tcos at +32 (per the original inline comments).
@ NOTE(review): these offsets must match the C FFTContext layout — confirm
@ against fft.h before changing them.
@ Stage 1: pre-rotate input pairs by the tcos table and scatter the results
@          into output[] at positions taken from revtab (two complex values
@          per iteration).  The loop is software-pipelined: the loads for
@          iteration i+1 are issued before the stores of iteration i.
@ Stage 2: in-place FFT on output via ff_fft_calc_neon.
@ Stage 3: post-rotate, walking output from both ends toward the middle
@          (r3/r0 move backward with stride r7 = -16, r6/r8 move forward).
@ Clobbers r12, lr and NEON regs d0-d7/d16-d25; r4-r8 saved via push/pop.
function ff_imdct_half_neon, export=1
        push            {r4-r8,lr}

        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #2            @ n4 = n >> 2 (loop counter, 2 per pass)
        add             r7,  r2,  r12,  lsl #1  @ r7 = input + 2n bytes (top of input)
        mov             r12, #-16               @ backward stride for the r7 stream
        sub             r7,  r7,  #16           @ point at last 16-byte group

        @ Pipeline prologue: first loads and the two products reused in the loop.
        vld2.32         {d16-d17},[r7,:128],r12 @ d16=x,n1 d17=x,n0
        vld2.32         {d0-d1},  [r2,:128]!    @ d0 =m0,x d1 =m1,x
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s2
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
1:      @ pre-rotation loop: 2 complex outputs per iteration
        subs            lr,  lr,  #2
        ldr             r6,  [r3], #4           @ two packed 16-bit revtab indices
        vmul.f32        d4,  d0,  d3
        vmul.f32        d5,  d17, d3
        vsub.f32        d4,  d6,  d4            @ real parts
        vadd.f32        d5,  d5,  d7            @ imaginary parts
        uxth            r8,  r6,  ror #16       @ high halfword index
        uxth            r6,  r6                 @ low halfword index
        add             r8,  r1,  r8,  lsl #3   @ scatter addresses: output + idx*8
        add             r6,  r1,  r6,  lsl #3
        beq             1f                      @ last iteration: store and exit
        @ Start next iteration's loads/multiplies before storing this one.
        vld2.32         {d16-d17},[r7,:128],r12
        vld2.32         {d0-d1},  [r2,:128]!
        vrev64.32       d17, d17
        vld2.32         {d2,d3},  [r4,:128]!    @ d2=c0,c1 d3=s0,s2
        vmul.f32        d6,  d17, d2
        vmul.f32        d7,  d0,  d2
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]
        b               1b
1:      @ pipeline epilogue: final pair of scattered stores
        vst2.32         {d4[0],d5[0]}, [r6,:64]
        vst2.32         {d4[1],d5[1]}, [r8,:64]

        @ FFT in place on output; keep context/output in callee-saved regs
        @ across the call.
        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        @ Post-rotation works from the middle of tcos/output outward:
        @ r1/r3 run backward (stride -16), r4/r6 run forward.
        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16               @ backward stride
        mov             r8,  r6                 @ forward store pointer
        mov             r0,  r3                 @ backward store pointer

        @ Pipeline prologue for the post-rotation loop.
        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =i1,r1 d1 =i0,r0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=i2,r2 d21=i3,r3
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
1:      @ post-rotation loop: 4 complex values (2 from each end) per iteration
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18
        vld2.32         {d17,d19},[r4,:128]!    @ d17=c2,c3 d19=s2,s3
        vmul.f32        d4,  d1,  d18
        vmul.f32        d5,  d21, d19
        vmul.f32        d6,  d20, d19
        vmul.f32        d22, d1,  d16
        vmul.f32        d23, d21, d17
        vmul.f32        d24, d0,  d16
        vmul.f32        d25, d20, d17
        vadd.f32        d7,  d7,  d22
        vadd.f32        d6,  d6,  d23
        vsub.f32        d4,  d4,  d24
        vsub.f32        d5,  d5,  d25
        beq             1f
        @ Next iteration's loads issued ahead of this iteration's stores.
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ d16=c1,c0 d18=s1,s0
        vrev64.32       q3,  q3                 @ reverse for the mirrored halves
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:      @ pipeline epilogue: final stores (no post-increment needed)
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r8,pc}
.endfunc
123

    
124
@ void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
@ Full inverse MDCT built on the half transform: ff_imdct_half_neon is run
@ with its output placed n bytes (n = 1 << mdct_bits) into the caller's
@ buffer, then the remaining parts of the output are reconstructed by
@ reversing that half-transform and flipping the float sign bit
@ (veor with d30 = 1<<31 negates IEEE-754 floats without arithmetic).
@ r5 preserves the original output pointer across the call; r4 holds the
@ byte count, consumed 16 bytes (4 floats) per side per iteration.
@ NOTE(review): the exact symmetry used (which quarter is mirrored/negated)
@ follows from the pointer setup below — verify against the C reference
@ ff_imdct_calc before modifying.
function ff_imdct_calc_neon, export=1
        push            {r4-r6,lr}

        ldr             r3,  [r0, #28]          @ mdct_bits
        mov             r4,  #1
        mov             r5,  r1                 @ keep caller's output pointer
        lsl             r4,  r4,  r3            @ n = 1 << mdct_bits
        add             r1,  r1,  r4            @ half-transform lands at output + n bytes

        bl              ff_imdct_half_neon

        @ Set up four streams over the half-transform result:
        @ r2 reads backward / r1 reads forward; r0 writes backward (stride -8
        @ per d-register) and r5 writes forward from the buffer start.
        add             r0,  r5,  r4,  lsl #2
        add             r1,  r5,  r4,  lsl #1
        sub             r0,  r0,  #8
        sub             r2,  r1,  #16
        mov             r3,  #-16
        mov             r6,  #-8
        vmov.i32        d30, #1<<31             @ sign-bit mask for float negation
1:      @ 4 floats from each end per iteration
        vld1.32         {d0-d1},  [r2,:128], r3
        pld             [r0, #-16]
        vrev64.32       q0,  q0                 @ reverse element order
        vld1.32         {d2-d3},  [r1,:128]!
        veor            d4,  d1,  d30           @ negate via sign-bit flip
        pld             [r2, #-16]
        vrev64.32       q1,  q1
        veor            d5,  d0,  d30
        vst1.32         {d2},     [r0,:64], r6  @ store reversed halves backward
        vst1.32         {d3},     [r0,:64], r6
        vst1.32         {d4-d5},  [r5,:128]!    @ store negated values forward
        subs            r4,  r4,  #16
        bgt             1b

        pop             {r4-r6,pc}
.endfunc
159

    
160
@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *out, const FFTSample *in)
@ Forward MDCT.  Stage 1 folds the input quarters (in0u/in2u read forward,
@ in4d/in3d/in2d/in1d read backward and reversed — see inline comments)
@ into complex values, rotates them by tcos (r4 walks the table forward,
@ r5 walks it backward with stride -16), and scatters results into out[]
@ at indices read from both ends of revtab: r6 = [r3,#4]! walks forward,
@ r10 = [r3, lr, lsr #1] walks backward as lr counts down.  Stage 2 is an
@ in-place FFT (ff_fft_calc_neon); stage 3 is a post-rotation structured
@ like ff_imdct_half_neon's but with q2 negated (forward-transform sign).
@ The main loop is software-pipelined; note r12 doubles as the -16 load
@ stride and a scatter address, so it is reloaded each iteration.
@ NOTE(review): context offsets +28/+32/+8 are taken from the inline
@ comments — confirm against the FFTContext definition.
function ff_mdct_calc_neon, export=1
        push            {r4-r10,lr}

        mov             r12, #1
        ldr             lr,  [r0, #28]          @ mdct_bits
        ldr             r4,  [r0, #32]          @ tcos
        ldr             r3,  [r0, #8]           @ revtab
        lsl             lr,  r12, lr            @ n  = 1 << nbits
        add             r7,  r2,  lr            @ in4u
        sub             r9,  r7,  #16           @ in4d
        add             r2,  r7,  lr,  lsl #1   @ in3u
        add             r8,  r9,  lr,  lsl #1   @ in3d
        add             r5,  r4,  lr,  lsl #1   @ end of tcos
        sub             r5,  r5,  #16           @ last 16-byte group of tcos
        sub             r3,  r3,  #4            @ pre-bias for [r3,#4]! writeback
        mov             r12, #-16               @ backward stride

        @ Pipeline prologue: first fold of the input quarters.
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
        vsub.f32        d16, d16, d2            @ in0u-in2d      R
        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
1:      @ fold + pre-rotation loop: 4 complex outputs (2 per table end) per pass
        vmul.f32        d7,  d0,  d21           @  I*s
        ldr             r10, [r3, lr, lsr #1]   @ revtab pair from the far end
        vmul.f32        d6,  d1,  d20           @ -R*c
        ldr             r6,  [r3, #4]!          @ revtab pair from the near end
        vmul.f32        d4,  d1,  d21           @ -R*s
        vmul.f32        d5,  d0,  d20           @  I*c
        vmul.f32        d24, d16, d30           @  R*c
        vmul.f32        d25, d17, d31           @ -I*s
        vmul.f32        d22, d16, d31           @  R*s
        vmul.f32        d23, d17, d30           @  I*c
        subs            lr,  lr,  #16
        vsub.f32        d6,  d6,  d7            @ -R*c-I*s
        vadd.f32        d7,  d4,  d5            @ -R*s+I*c
        vsub.f32        d24, d25, d24           @ I*s-R*c
        vadd.f32        d25, d22, d23           @ R*s-I*c
        beq             1f
        mov             r12, #-16               @ r12 is clobbered below; restore stride
        vld2.32         {d16,d18},[r9,:128],r12 @ in0u0,in0u1 in4d1,in4d0
        vld2.32         {d17,d19},[r8,:128],r12 @ in2u0,in2u1 in3d1,in3d0
        vneg.f32        d7,  d7                 @  R*s-I*c
        vld2.32         {d0, d2}, [r7,:128]!    @ in4u0,in4u1 in2d1,in2d0
        vrev64.32       q9,  q9                 @ in4d0,in4d1 in3d0,in3d1
        vld2.32         {d1, d3}, [r2,:128]!    @ in3u0,in3u1 in1d1,in1d0
        vsub.f32        d0,  d18, d0            @ in4d-in4u      I
        vld2.32         {d20,d21},[r4,:128]!    @ c0,c1 s0,s1
        vrev64.32       q1,  q1                 @ in2d0,in2d1 in1d0,in1d1
        vld2.32         {d30,d31},[r5,:128],r12 @ c2,c3 s2,s3
        vadd.f32        d1,  d1,  d19           @ in3u+in3d     -R
        vsub.f32        d16, d16, d2            @ in0u-in2d      R
        vadd.f32        d17, d17, d3            @ in2u+in1d     -I
        @ Scatter stores: unpack the two packed 16-bit indices per word.
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6 , r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]
        b               1b
1:      @ pipeline epilogue: final negate and scatter of the last 4 outputs
        vneg.f32        d7,  d7                 @  R*s-I*c
        uxth            r12, r6,  ror #16
        uxth            r6,  r6
        add             r12, r1,  r12, lsl #3
        add             r6,  r1,  r6,  lsl #3
        vst2.32         {d6[0],d7[0]}, [r6,:64]
        vst2.32         {d6[1],d7[1]}, [r12,:64]
        uxth            r6,  r10, ror #16
        uxth            r10, r10
        add             r6 , r1,  r6,  lsl #3
        add             r10, r1,  r10, lsl #3
        vst2.32         {d24[0],d25[0]},[r10,:64]
        vst2.32         {d24[1],d25[1]},[r6,:64]

        @ FFT in place; context/output survive in callee-saved regs.
        mov             r4,  r0
        mov             r6,  r1
        bl              ff_fft_calc_neon

        mov             r12, #1
        ldr             lr,  [r4, #28]          @ mdct_bits
        ldr             r4,  [r4, #32]          @ tcos
        lsl             r12, r12, lr            @ n  = 1 << nbits
        lsr             lr,  r12, #3            @ n8 = n >> 3

        @ Post-rotation, from the middle of tcos/output outward (as in
        @ ff_imdct_half_neon): r1/r3 backward, r4/r6 forward.
        add             r4,  r4,  lr,  lsl #3
        add             r6,  r6,  lr,  lsl #3
        sub             r1,  r4,  #16
        sub             r3,  r6,  #16

        mov             r7,  #-16
        mov             r8,  r6
        mov             r0,  r3

        vld2.32         {d0-d1},  [r3,:128], r7 @ d0 =r1,i1 d1 =r0,i0
        vld2.32         {d20-d21},[r6,:128]!    @ d20=r2,i2 d21=r3,i3
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
1:      @ post-rotation loop: 4 complex values per iteration, pipelined
        subs            lr,  lr,  #2
        vmul.f32        d7,  d0,  d18           @ r1*s1,r0*s0
        vld2.32         {d17,d19},[r4,:128]!    @ c2,c3 s2,s3
        vmul.f32        d4,  d1,  d18           @ i1*s1,i0*s0
        vmul.f32        d5,  d21, d19           @ i2*s2,i3*s3
        vmul.f32        d6,  d20, d19           @ r2*s2,r3*s3
        vmul.f32        d24, d0,  d16           @ r1*c1,r0*c0
        vmul.f32        d25, d20, d17           @ r2*c2,r3*c3
        vmul.f32        d22, d21, d17           @ i2*c2,i3*c3
        vmul.f32        d23, d1,  d16           @ i1*c1,i0*c0
        vadd.f32        d4,  d4,  d24           @ i1*s1+r1*c1,i0*s0+r0*c0
        vadd.f32        d5,  d5,  d25           @ i2*s2+r2*c2,i3*s3+r3*c3
        vsub.f32        d6,  d22, d6            @ i2*c2-r2*s2,i3*c3-r3*s3
        vsub.f32        d7,  d23, d7            @ i1*c1-r1*s1,i0*c0-r0*s0
        vneg.f32        q2,  q2                 @ forward-transform sign flip
        beq             1f
        vld2.32         {d0-d1},  [r3,:128], r7
        vld2.32         {d20-d21},[r6,:128]!
        vld2.32         {d16,d18},[r1,:128], r7 @ c1,c0 s1,s0
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128], r7
        vst2.32         {d5,d7},  [r8,:128]!
        b               1b
1:      @ pipeline epilogue
        vrev64.32       q3,  q3
        vst2.32         {d4,d6},  [r0,:128]
        vst2.32         {d5,d7},  [r8,:128]

        pop             {r4-r10,pc}
.endfunc