ffmpeg/libavcodec/arm/dsputil_neon.S @ 015f9f1a
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text

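@ ff_clear_block_neon:  zero one 8x8 block of 16-bit coefficients
@ (128 bytes); ff_clear_blocks_neon zeroes six consecutive blocks.
@   r0 = block pointer (16-byte aligned)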
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
.rept   8
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc

function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
.rept   8*6
        vst1.16         {q0}, [r0,:128]!
.endr
        bx              lr
endfunc

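@ Half-pel motion compensation primitives behind the put/avg_pixels
@ DSP entry points:
@   pixels16/pixels8      straight copy (avg=1 averages into dst)
@   *_x2, *_y2, *_xy2     mean with the right, lower, or diagonal
@                         neighbour for half-pel interpolation
@ Calling convention (from the C prototypes): r0 = dst, r1 = src,
@ r2 = line size, r3 = height.  The vhadd/vshrn arguments select
@ rounding (vrhadd/vrshrn) or the truncating no_rnd variants.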
.macro  pixels16        avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

.macro  pixels16_x2     vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

.macro  pixels16_y2     vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

.macro  pixels16_xy2    vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
.endm

.macro  pixels8         avg=0
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_x2      vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_y2      vhadd=vrhadd.u8
        vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0}, [r1], r2
        \vhadd          d5,  d0,  d1
        vld1.64         {d1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_xy2     vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        bx              lr
.endm

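@ pixfunc expands one of the macros above into an exported function;
@ pixfunc2 additionally emits a _no_rnd variant using the truncating
@ averages.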
.macro  pixfunc         pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd_op \args
endfunc
.endm

.macro  pixfunc2        pfx name args:vararg
        pixfunc         \pfx \name
        pixfunc         \pfx \name \args
.endm

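@ The h264 qpel mc00 cases are plain copies/averages, so these entry
@ points only load the height into r3 and fall through into the
@ pixels16/pixels8 functions expanded directly below them.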
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_ pixels8
        pixfunc2        put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_ pixels8,, 1

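@ ff_put_pixels_clamped_neon: saturate an 8x8 block of signed 16-bit
@ coefficients to unsigned 8-bit pixels (vqmovun).
@   r0 = coefficients, r1 = dst, r2 = line size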
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0,  q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1,  q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2,  q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3,  q11
        vst1.64         {d0}, [r1,:64], r2
        vqmovun.s16     d4,  q12
        vst1.64         {d1}, [r1,:64], r2
        vqmovun.s16     d5,  q13
        vst1.64         {d2}, [r1,:64], r2
        vqmovun.s16     d6,  q14
        vst1.64         {d3}, [r1,:64], r2
        vqmovun.s16     d7,  q15
        vst1.64         {d4}, [r1,:64], r2
        vst1.64         {d5}, [r1,:64], r2
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

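@ ff_put_signed_pixels_clamped_neon: as above, but saturate to signed
@ 8-bit (vqmovn) and add a 128 bias to map into the 0..255 range.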
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0,  q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1,  q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2,  q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0,  d0,  d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1,  d1,  d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2,  d2,  d31
        vst1.64         {d0}, [r1,:64], r2
        vqmovn.s16      d3,  q9
        vst1.64         {d1}, [r1,:64], r2
        vqmovn.s16      d4,  q10
        vst1.64         {d2}, [r1,:64], r2
        vqmovn.s16      d5,  q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3,  d3,  d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4,  d4,  d31
        vadd.u8         d5,  d5,  d31
        vst1.64         {d3}, [r1,:64], r2
        vqmovn.s16      d6,  q12
        vst1.64         {d4}, [r1,:64], r2
        vqmovn.s16      d7,  q13
        vst1.64         {d5}, [r1,:64], r2
        vadd.u8         d6,  d6,  d31
        vadd.u8         d7,  d7,  d31
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

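@ ff_add_pixels_clamped_neon: add an 8x8 coefficient block to the
@ pixels already at dst, saturating the sums to unsigned 8-bit.
@   r0 = coefficients, r1 = dst pixels, r2 = line size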
function ff_add_pixels_clamped_neon, export=1
        mov             r3,  r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0,  q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1,  q1,  d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vqmovun.s16     d4,  q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6,  q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0,  q0,  d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1,  q1,  d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0,  q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2,  q2,  d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2,  q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4,  q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3,  q3,  d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6,  q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
endfunc

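@ ff_float_to_int16_neon: convert floats (already scaled to the s16
@ range) to signed 16-bit.  Converting to Q16 fixed point first lets
@ the saturating vcvt provide the clipping for free; vshrn #16 then
@ drops the fraction bits.
@   r0 = dst, r1 = src, r2 = sample count (multiple of 8)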
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},   [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},   [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17}, [r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},   [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},   [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},   [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        bx              lr
endfunc

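@ ff_float_to_int16_interleave_neon: same conversion, interleaving
@ several channels into one stream.  Mono tail-calls the plain
@ conversion; stereo interleaves pairs with vsri; the labelled paths
@ below then handle 4 channels, 2 channels and 1 channel at a time
@ for the general case, using ip as the output stride in bytes.
@   r0 = dst, r1 = array of channel pointers, r2 = samples per
@   channel, r3 = channel count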
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3,  #2
        ldrlt           r1,  [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3,  [r1]
        ldr             r1,  [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21}, [r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25}, [r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27}, [r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21}, [r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23}, [r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17}, [r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25}, [r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27}, [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25}, [r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27}, [r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21}, [r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23}, [r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27}, [r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23}, [r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},   [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},   [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},   [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18}, [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22}, [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19}, [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23}, [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},  [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},  [r8], ip
        vld1.64         {d20-d21}, [r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},  [r8], ip
        vld1.64         {d22-d23}, [r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},  [r8], ip
        b               6b
7:      vst1.64         {d2},  [r8], ip
        vst1.64         {d6},  [r8], ip
        vst1.64         {d3},  [r8], ip
        vst1.64         {d7},  [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

        @ 2 channels
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vst1.32         {d19[1]},  [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]},  [r8], ip
        vst1.32         {d22[1]},  [r8], ip
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},   [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},   [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},   [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]},  [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]},  [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]},  [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
        beq             6f
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},   [r8], ip
        vst1.32         {d2[1]},   [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},   [r8], ip
        vst1.32         {d3[1]},   [r8], ip
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},   [r8], ip
        vst1.32         {d6[1]},   [r8], ip
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},   [r8], ip
        vst1.32         {d7[1]},   [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},   [r8], ip
        vst1.32         {d2[1]},   [r8], ip
        vst1.32         {d3[0]},   [r8], ip
        vst1.32         {d3[1]},   [r8], ip
        vst1.32         {d6[0]},   [r8], ip
        vst1.32         {d6[1]},   [r8], ip
        vst1.32         {d7[0]},   [r8], ip
        vst1.32         {d7[1]},   [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vst1.32         {d19[1]},  [r8], ip
        vst1.32         {d22[0]},  [r8], ip
        vst1.32         {d22[1]},  [r8], ip
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4,  [r1], #4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},   [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},   [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},   [r5,:16], ip
        vst1.16         {d0[3]},   [r5,:16], ip
        vst1.16         {d1[1]},   [r5,:16], ip
        vst1.16         {d1[3]},   [r5,:16], ip
        vst1.16         {d2[1]},   [r5,:16], ip
        vst1.16         {d2[3]},   [r5,:16], ip
        vst1.16         {d3[1]},   [r5,:16], ip
        vst1.16         {d3[3]},   [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},   [r5,:16], ip
        vst1.16         {d4[3]},   [r5,:16], ip
        vst1.16         {d5[1]},   [r5,:16], ip
        vst1.16         {d5[3]},   [r5,:16], ip
        vst1.16         {d6[1]},   [r5,:16], ip
        vst1.16         {d6[3]},   [r5,:16], ip
        vst1.16         {d7[1]},   [r5,:16], ip
        vst1.16         {d7[3]},   [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},   [r5,:16], ip
        vst1.16         {d0[3]},   [r5,:16], ip
        vst1.16         {d1[1]},   [r5,:16], ip
        vst1.16         {d1[3]},   [r5,:16], ip
        vst1.16         {d2[1]},   [r5,:16], ip
        vst1.16         {d2[3]},   [r5,:16], ip
        vst1.16         {d3[1]},   [r5,:16], ip
        vst1.16         {d3[3]},   [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
endfunc

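@ ff_vector_fmul_neon: dst[i] = src0[i] * src1[i]
@   r0 = dst, r1 = src0, r2 = src1, r3 = length (multiple of 8)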
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.64         {d0-d3},   [r1,:128]!
        vld1.64         {d4-d7},   [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r3,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},   [r1,:128]!
        vld1.64         {d4-d5},   [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},   [r1,:128]!
        vld1.64         {d6-d7},   [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19}, [r0,:128]!
        vld1.64         {d0-d1},   [r1,:128]!
        vld1.64         {d4-d5},   [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},   [r1,:128]!
        vld1.64         {d6-d7},   [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23}, [r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f
2:      vld1.64         {d0-d1},   [r1,:128]!
        vld1.64         {d4-d5},   [r2,:128]!
        vst1.64         {d16-d17}, [r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},   [r1,:128]!
        vld1.64         {d6-d7},   [r2,:128]!
        vst1.64         {d18-d19}, [r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19}, [r0,:128]!
        bx              lr
endfunc

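@ ff_vector_fmul_window_neon: windowed overlap-add as used on MDCT
@ output: src0 is read forwards and src1 backwards under a 2*len
@ window, and dst is written from both ends at once.  The bias and
@ the length arrive in d0[0]/on the stack depending on the float
@ ABI, hence the VFP/NOVFP lines.
@   r0 = dst, r1 = src0, r2 = src1, r3 = window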
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32         q8,  d0[0]
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
VFP     ldr             lr,  [sp, #12]
NOVFP   ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5,  lsl #2
        add             r4,  r3,  r5,  lsl #3
        add             ip,  r0,  r5,  lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},   [r1,:128]!
        vld1.64         {d2,d3},   [r2,:128], r5
        vld1.64         {d4,d5},   [r3,:128]!
        vld1.64         {d6,d7},   [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmov            q11, q8
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},   [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19}, [r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25}, [r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},   [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21}, [r0,:128]!
        vst1.64         {d22,d23}, [ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21}, [r0,:128]!
        vst1.64         {d22,d23}, [ip,:128], r5
        pop             {r4,r5,pc}
endfunc

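@ ff_vorbis_inverse_coupling_neon: reconstruct a magnitude/angle
@ coupled channel pair in place and branch-free; the angle vector is
@ folded onto the magnitude vector according to the sign bits.
@   r0 = mag, r1 = ang, r2 = blocksize (multiple of 4)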
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},   [r1,:128]!
        vld1.32         {d0-d1},   [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25}, [r3, :128]!
        vst1.32         {d22-d23}, [r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},   [r3, :128]!
        vst1.32         {d0-d1},   [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},   [r3, :128]!
        vst1.32         {d0-d1},   [r12,:128]!
        bxlt            lr

3:      vld1.32         {d2-d3},   [r1,:128]
        vld1.32         {d0-d1},   [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},   [r0,:128]!
        vst1.32         {d0-d1},   [r1,:128]!
        bx              lr
endfunc
#endif

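@ ff_vector_fmul_scalar_neon: dst[i] = src[i] * mul.  The scalar and
@ the length land in different registers depending on the float ABI,
@ hence the VFP/NOVFP variants.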
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q1}, [r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2}, [r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3}, [r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0}, [r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1}, [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0}, [r1,:128]!
        vst1.32         {q2}, [r0,:128]!
        vld1.32         {q1}, [r1,:128]!
        vst1.32         {q3}, [r0,:128]!
        b               1b
2:      vst1.32         {q2}, [r0,:128]!
        vst1.32         {q3}, [r0,:128]!
        ands            len, len, #15
        bxeq            lr
3:      vld1.32         {q0}, [r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0}, [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

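@ The *_sv_* functions multiply by "scattered" short vectors: sv is
@ an array of pointers, each to a 2- or 4-element float vector, and
@ advances by one pointer per 2/4 output elements (indexing as in
@ the scalar dsputil implementations):
@   vector_fmul_sv_scalar_N: dst[i] = src[i] * sv[i/N][i%N] * mul
@   sv_fmul_scalar_N:        dst[i] = sv[i/N][i%N] * mul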
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3,  [sp]
        vld1.32         {d0}, [r1,:64]!
        vld1.32         {d1}, [r1,:64]!
1:      subs            r3,  r3,  #4
        vmul.f32        d4,  d0,  d16
        vmul.f32        d5,  d1,  d16
        ldr             r12, [r2], #4
        vld1.32         {d2}, [r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3}, [r12,:64]
        vmul.f32        d4,  d4,  d2
        vmul.f32        d5,  d5,  d3
        beq             2f
        vld1.32         {d0}, [r1,:64]!
        vld1.32         {d1}, [r1,:64]!
        vst1.32         {d4}, [r0,:64]!
        vst1.32         {d5}, [r0,:64]!
        b               1b
2:      vst1.32         {d4}, [r0,:64]!
        vst1.32         {d5}, [r0,:64]!
        bx              lr
endfunc

function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3,  [sp]
        push            {lr}
        bics            lr,  r3,  #7
        beq             3f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q2}, [r1,:128]!
1:      ldr             r12, [r2], #4
        vld1.32         {q1}, [r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3}, [r12,:128]
        vmul.f32        q8,  q0,  q10
        vmul.f32        q8,  q8,  q1
        vmul.f32        q9,  q2,  q10
        vmul.f32        q9,  q9,  q3
        subs            lr,  lr,  #8
        beq             2f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q2}, [r1,:128]!
        vst1.32         {q8}, [r0,:128]!
        vst1.32         {q9}, [r0,:128]!
        b               1b
2:      vst1.32         {q8}, [r0,:128]!
        vst1.32         {q9}, [r0,:128]!
        ands            r3,  r3,  #7
        popeq           {pc}
3:      vld1.32         {q0}, [r1,:128]!
        ldr             r12, [r2], #4
        vld1.32         {q1}, [r12,:128]
        vmul.f32        q0,  q0,  q10
        vmul.f32        q0,  q0,  q1
        vst1.32         {q0}, [r0,:128]!
        subs            r3,  r3,  #4
        bgt             3b
        pop             {pc}
endfunc

function ff_sv_fmul_scalar_2_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        ldr             r12, [r1], #4
        vld1.32         {d0}, [r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1}, [r12,:64]
1:      vmul.f32        q1,  q0,  q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4
        vld1.32         {d0}, [r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1}, [r12,:64]
        vst1.32         {q1}, [r0,:128]!
        b               1b
2:      vst1.32         {q1}, [r0,:128]!
        bx              lr
        .unreq          len
endfunc

function ff_sv_fmul_scalar_4_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
1:      ldr             r12, [r1], #4
        vld1.32         {q0}, [r12,:128]
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0}, [r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
endfunc

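@ ff_butterflies_float_neon: in-place butterfly:
@   t = v1[i] - v2[i];  v1[i] += v2[i];  v2[i] = t
@   r0 = v1, r1 = v2, r2 = length (multiple of 4)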
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0}, [r0,:128]
        vld1.32         {q1}, [r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2}, [r1,:128]!
        vst1.32         {q1}, [r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

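@ ff_scalarproduct_float_neon: dot product of two float vectors,
@ returned in s0 (VFP ABI) or r0 (soft-float).
@   r0 = v1, r1 = v2, r2 = length (multiple of 4)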
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0}, [r0,:128]!
        vld1.32         {q1}, [r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

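@ ff_int32_to_float_fmul_scalar_neon: dst[i] = src[i] * mul, with
@ src 32-bit integers converted to float on the fly.
@   r0 = dst, r1 = src; mul and len placement depends on the float
@   ABI (see the VFP/NOVFP lines); len is a multiple of 8.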
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32         q0,  d0[0]
VFP     len .req r2
NOVFP   vdup.32         q0,  r2
NOVFP   len .req r3

        vld1.32         {q1}, [r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2}, [r1,:128]!
        vcvt.f32.s32    q8,  q2
1:      subs            len, len, #8
        pld             [r1, #16]
        vmul.f32        q9,  q3,  q0
        vmul.f32        q10, q8,  q0
        beq             2f
        vld1.32         {q1}, [r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2}, [r1,:128]!
        vcvt.f32.s32    q8,  q2
        vst1.32         {q9},  [r0,:128]!
        vst1.32         {q10}, [r0,:128]!
        b               1b
2:      vst1.32         {q9},  [r0,:128]!
        vst1.32         {q10}, [r0,:128]!
        bx              lr
        .unreq          len
endfunc

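@ ff_vector_fmul_reverse_neon: dst[i] = src0[i] * src1[len-1-i];
@ src1 is walked backwards with a negative stride plus vrev64.
@   r0 = dst, r1 = src0, r2 = src1, r3 = length (multiple of 8)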
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

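@ ff_vector_fmul_add_neon: dst[i] = src0[i] * src1[i] + src2[i]
@   r0 = dst, r1 = src0, r2 = src1, r3 = src2, length on the stack
@   (multiple of 8)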
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q8}, [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1}, [r1,:128]!
        vld1.32         {q9}, [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

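@ ff_vector_clipf_neon: clamp each element to [min,max] using
@ vmin/vmax.
@   r0 = dst, r1 = src; min, max and len placement depends on the
@   float ABI (see the VFP/NOVFP lines); len is a multiple of 8.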
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2}, [r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3}, [r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2}, [r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3}, [r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8}, [r0,:128]!
        vst1.f32        {q9}, [r0,:128]!
        b               1b
2:      vst1.f32        {q8}, [r0,:128]!
        vst1.f32        {q9}, [r0,:128]!
        bx              lr
endfunc