@ ffmpeg / libavcodec / arm / dsputil_neon.S
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21
22 e814015d Måns Rullgård
#include "config.h"
23 569f5a75 Måns Rullgård
#include "asm.S"
24
25
        preserve8
26
        .text
27
28
        .macro pixels16 avg=0
29
.if \avg
30
        mov             ip,  r0
31
.endif
32
1:      vld1.64         {d0, d1},  [r1], r2
33
        vld1.64         {d2, d3},  [r1], r2
34
        vld1.64         {d4, d5},  [r1], r2
35
        pld             [r1, r2, lsl #2]
36
        vld1.64         {d6, d7},  [r1], r2
37
        pld             [r1]
38
        pld             [r1, r2]
39
        pld             [r1, r2, lsl #1]
40
.if \avg
41 d8f3f340 David Conrad
        vld1.64         {d16,d17}, [ip,:128], r2
42 569f5a75 Måns Rullgård
        vrhadd.u8       q0,  q0,  q8
43 d8f3f340 David Conrad
        vld1.64         {d18,d19}, [ip,:128], r2
44 569f5a75 Måns Rullgård
        vrhadd.u8       q1,  q1,  q9
45 d8f3f340 David Conrad
        vld1.64         {d20,d21}, [ip,:128], r2
46 569f5a75 Måns Rullgård
        vrhadd.u8       q2,  q2,  q10
47 d8f3f340 David Conrad
        vld1.64         {d22,d23}, [ip,:128], r2
48 569f5a75 Måns Rullgård
        vrhadd.u8       q3,  q3,  q11
49
.endif
50
        subs            r3,  r3,  #4
51
        vst1.64         {d0, d1},  [r0,:128], r2
52
        vst1.64         {d2, d3},  [r0,:128], r2
53
        vst1.64         {d4, d5},  [r0,:128], r2
54
        vst1.64         {d6, d7},  [r0,:128], r2
55
        bne             1b
56
        bx              lr
57
        .endm
58
59
        .macro pixels16_x2 vhadd=vrhadd.u8
60
1:      vld1.64         {d0-d2},   [r1], r2
61
        vld1.64         {d4-d6},   [r1], r2
62
        pld             [r1]
63
        pld             [r1, r2]
64
        subs            r3,  r3,  #2
65
        vext.8          q1,  q0,  q1,  #1
66
        \vhadd          q0,  q0,  q1
67
        vext.8          q3,  q2,  q3,  #1
68
        \vhadd          q2,  q2,  q3
69
        vst1.64         {d0, d1},  [r0,:128], r2
70
        vst1.64         {d4, d5},  [r0,:128], r2
71
        bne             1b
72
        bx              lr
73
        .endm
74
75
        .macro pixels16_y2 vhadd=vrhadd.u8
76 d647ed78 David Conrad
        vld1.64         {d0, d1},  [r1], r2
77
        vld1.64         {d2, d3},  [r1], r2
78 569f5a75 Måns Rullgård
1:      subs            r3,  r3,  #2
79
        \vhadd          q2,  q0,  q1
80 d647ed78 David Conrad
        vld1.64         {d0, d1},  [r1], r2
81 569f5a75 Måns Rullgård
        \vhadd          q3,  q0,  q1
82 d647ed78 David Conrad
        vld1.64         {d2, d3},  [r1], r2
83 569f5a75 Måns Rullgård
        pld             [r1]
84 d647ed78 David Conrad
        pld             [r1, r2]
85 569f5a75 Måns Rullgård
        vst1.64         {d4, d5},  [r0,:128], r2
86
        vst1.64         {d6, d7},  [r0,:128], r2
87
        bne             1b
88 d647ed78 David Conrad
        bx              lr
89 569f5a75 Måns Rullgård
        .endm
90
91
        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
92 d647ed78 David Conrad
        vld1.64         {d0-d2},   [r1], r2
93
        vld1.64         {d4-d6},   [r1], r2
94 569f5a75 Måns Rullgård
.if \no_rnd
95
        vmov.i16        q13, #1
96
.endif
97
        pld             [r1]
98 d647ed78 David Conrad
        pld             [r1, r2]
99 569f5a75 Måns Rullgård
        vext.8          q1,  q0,  q1,  #1
100
        vext.8          q3,  q2,  q3,  #1
101
        vaddl.u8        q8,  d0,  d2
102
        vaddl.u8        q10, d1,  d3
103
        vaddl.u8        q9,  d4,  d6
104
        vaddl.u8        q11, d5,  d7
105
1:      subs            r3,  r3,  #2
106 d647ed78 David Conrad
        vld1.64         {d0-d2},   [r1], r2
107 569f5a75 Måns Rullgård
        vadd.u16        q12, q8,  q9
108
        pld             [r1]
109
.if \no_rnd
110
        vadd.u16        q12, q12, q13
111
.endif
112
        vext.8          q15, q0,  q1,  #1
113
        vadd.u16        q1 , q10, q11
114
        \vshrn          d28, q12, #2
115
.if \no_rnd
116
        vadd.u16        q1,  q1,  q13
117
.endif
118
        \vshrn          d29, q1,  #2
119
        vaddl.u8        q8,  d0,  d30
120 d647ed78 David Conrad
        vld1.64         {d2-d4},   [r1], r2
121 569f5a75 Måns Rullgård
        vaddl.u8        q10, d1,  d31
122
        vst1.64         {d28,d29}, [r0,:128], r2
123
        vadd.u16        q12, q8,  q9
124 d647ed78 David Conrad
        pld             [r1, r2]
125 569f5a75 Måns Rullgård
.if \no_rnd
126
        vadd.u16        q12, q12, q13
127
.endif
128
        vext.8          q2,  q1,  q2,  #1
129
        vadd.u16        q0,  q10, q11
130
        \vshrn          d30, q12, #2
131
.if \no_rnd
132
        vadd.u16        q0,  q0,  q13
133
.endif
134
        \vshrn          d31, q0,  #2
135
        vaddl.u8        q9,  d2,  d4
136
        vaddl.u8        q11, d3,  d5
137
        vst1.64         {d30,d31}, [r0,:128], r2
138
        bgt             1b
139 d647ed78 David Conrad
        bx              lr
140 569f5a75 Måns Rullgård
        .endm
141
142 bef966e3 Måns Rullgård
        .macro pixels8 avg=0
143 569f5a75 Måns Rullgård
1:      vld1.64         {d0}, [r1], r2
144
        vld1.64         {d1}, [r1], r2
145
        vld1.64         {d2}, [r1], r2
146
        pld             [r1, r2, lsl #2]
147
        vld1.64         {d3}, [r1], r2
148
        pld             [r1]
149
        pld             [r1, r2]
150
        pld             [r1, r2, lsl #1]
151 bef966e3 Måns Rullgård
.if \avg
152
        vld1.64         {d4}, [r0,:64], r2
153
        vrhadd.u8       d0,  d0,  d4
154
        vld1.64         {d5}, [r0,:64], r2
155
        vrhadd.u8       d1,  d1,  d5
156
        vld1.64         {d6}, [r0,:64], r2
157
        vrhadd.u8       d2,  d2,  d6
158
        vld1.64         {d7}, [r0,:64], r2
159
        vrhadd.u8       d3,  d3,  d7
160
        sub             r0,  r0,  r2,  lsl #2
161
.endif
162 569f5a75 Måns Rullgård
        subs            r3,  r3,  #4
163
        vst1.64         {d0}, [r0,:64], r2
164
        vst1.64         {d1}, [r0,:64], r2
165
        vst1.64         {d2}, [r0,:64], r2
166
        vst1.64         {d3}, [r0,:64], r2
167
        bne             1b
168
        bx              lr
169
        .endm
170
171
        .macro pixels8_x2 vhadd=vrhadd.u8
172
1:      vld1.64         {d0, d1},  [r1], r2
173
        vext.8          d1,  d0,  d1,  #1
174
        vld1.64         {d2, d3},  [r1], r2
175
        vext.8          d3,  d2,  d3,  #1
176
        pld             [r1]
177
        pld             [r1, r2]
178
        subs            r3,  r3,  #2
179
        vswp            d1,  d2
180
        \vhadd          q0,  q0,  q1
181
        vst1.64         {d0},      [r0,:64], r2
182
        vst1.64         {d1},      [r0,:64], r2
183
        bne             1b
184
        bx              lr
185
        .endm
186
187
        .macro pixels8_y2 vhadd=vrhadd.u8
188 d647ed78 David Conrad
        vld1.64         {d0},      [r1], r2
189
        vld1.64         {d1},      [r1], r2
190 569f5a75 Måns Rullgård
1:      subs            r3,  r3,  #2
191
        \vhadd          d4,  d0,  d1
192 d647ed78 David Conrad
        vld1.64         {d0},      [r1], r2
193 569f5a75 Måns Rullgård
        \vhadd          d5,  d0,  d1
194 d647ed78 David Conrad
        vld1.64         {d1},      [r1], r2
195 569f5a75 Måns Rullgård
        pld             [r1]
196 d647ed78 David Conrad
        pld             [r1, r2]
197 569f5a75 Måns Rullgård
        vst1.64         {d4},      [r0,:64], r2
198
        vst1.64         {d5},      [r0,:64], r2
199
        bne             1b
200 d647ed78 David Conrad
        bx              lr
201 569f5a75 Måns Rullgård
        .endm
202
203
        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
204 d647ed78 David Conrad
        vld1.64         {d0, d1},  [r1], r2
205
        vld1.64         {d2, d3},  [r1], r2
206 569f5a75 Måns Rullgård
.if \no_rnd
207
        vmov.i16        q11, #1
208
.endif
209
        pld             [r1]
210 d647ed78 David Conrad
        pld             [r1, r2]
211 569f5a75 Måns Rullgård
        vext.8          d4,  d0,  d1,  #1
212
        vext.8          d6,  d2,  d3,  #1
213
        vaddl.u8        q8,  d0,  d4
214
        vaddl.u8        q9,  d2,  d6
215
1:      subs            r3,  r3,  #2
216 d647ed78 David Conrad
        vld1.64         {d0, d1},  [r1], r2
217 569f5a75 Måns Rullgård
        pld             [r1]
218
        vadd.u16        q10, q8,  q9
219
        vext.8          d4,  d0,  d1,  #1
220
.if \no_rnd
221
        vadd.u16        q10, q10, q11
222
.endif
223
        vaddl.u8        q8,  d0,  d4
224
        \vshrn          d5,  q10, #2
225 d647ed78 David Conrad
        vld1.64         {d2, d3},  [r1], r2
226 569f5a75 Måns Rullgård
        vadd.u16        q10, q8,  q9
227 d647ed78 David Conrad
        pld             [r1, r2]
228 569f5a75 Måns Rullgård
.if \no_rnd
229
        vadd.u16        q10, q10, q11
230
.endif
231
        vst1.64         {d5},      [r0,:64], r2
232
        \vshrn          d7,  q10, #2
233
        vext.8          d6,  d2,  d3,  #1
234
        vaddl.u8        q9,  d2,  d6
235
        vst1.64         {d7},      [r0,:64], r2
236
        bgt             1b
237 d647ed78 David Conrad
        bx              lr
238 569f5a75 Måns Rullgård
        .endm
239
240
        .macro pixfunc pfx name suf rnd_op args:vararg
241
function ff_\pfx\name\suf\()_neon, export=1
242
        \name \rnd_op \args
243
        .endfunc
244
        .endm
245
246
        .macro pixfunc2 pfx name args:vararg
247
        pixfunc \pfx \name
248
        pixfunc \pfx \name \args
249
        .endm
250
251
function ff_put_h264_qpel16_mc00_neon, export=1
252 12bf71b6 Måns Rullgård
        mov             r3,  #16
253 569f5a75 Måns Rullgård
        .endfunc
254
255
        pixfunc  put_ pixels16
256
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
257
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
258
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
259
260
function ff_avg_h264_qpel16_mc00_neon, export=1
261 12bf71b6 Måns Rullgård
        mov             r3,  #16
262 569f5a75 Måns Rullgård
        .endfunc
263
264
        pixfunc  avg_ pixels16,, 1
265
266
function ff_put_h264_qpel8_mc00_neon, export=1
267 12bf71b6 Måns Rullgård
        mov             r3,  #8
268 569f5a75 Måns Rullgård
        .endfunc
269
270
        pixfunc  put_ pixels8
271
        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
272
        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
273
        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1
274 f23740d9 Måns Rullgård
275 bef966e3 Måns Rullgård
function ff_avg_h264_qpel8_mc00_neon, export=1
276
        mov             r3,  #8
277
        .endfunc
278
279
        pixfunc  avg_ pixels8,, 1
280
281 428bf2ac David Conrad
function ff_put_pixels_clamped_neon, export=1
282
        vld1.64         {d16-d19}, [r0,:128]!
283
        vqmovun.s16     d0, q8
284
        vld1.64         {d20-d23}, [r0,:128]!
285
        vqmovun.s16     d1, q9
286
        vld1.64         {d24-d27}, [r0,:128]!
287
        vqmovun.s16     d2, q10
288
        vld1.64         {d28-d31}, [r0,:128]!
289
        vqmovun.s16     d3, q11
290
        vst1.64         {d0},      [r1,:64], r2
291
        vqmovun.s16     d4, q12
292
        vst1.64         {d1},      [r1,:64], r2
293
        vqmovun.s16     d5, q13
294
        vst1.64         {d2},      [r1,:64], r2
295
        vqmovun.s16     d6, q14
296
        vst1.64         {d3},      [r1,:64], r2
297
        vqmovun.s16     d7, q15
298
        vst1.64         {d4},      [r1,:64], r2
299
        vst1.64         {d5},      [r1,:64], r2
300
        vst1.64         {d6},      [r1,:64], r2
301
        vst1.64         {d7},      [r1,:64], r2
302
        bx              lr
303
        .endfunc
304
305 cc2e5554 David Conrad
function ff_put_signed_pixels_clamped_neon, export=1
306
        vmov.u8         d31, #128
307
        vld1.64         {d16-d17}, [r0,:128]!
308
        vqmovn.s16      d0, q8
309
        vld1.64         {d18-d19}, [r0,:128]!
310
        vqmovn.s16      d1, q9
311
        vld1.64         {d16-d17}, [r0,:128]!
312
        vqmovn.s16      d2, q8
313
        vld1.64         {d18-d19}, [r0,:128]!
314
        vadd.u8         d0, d0, d31
315
        vld1.64         {d20-d21}, [r0,:128]!
316
        vadd.u8         d1, d1, d31
317
        vld1.64         {d22-d23}, [r0,:128]!
318
        vadd.u8         d2, d2, d31
319
        vst1.64         {d0},      [r1,:64], r2
320
        vqmovn.s16      d3, q9
321
        vst1.64         {d1},      [r1,:64], r2
322
        vqmovn.s16      d4, q10
323
        vst1.64         {d2},      [r1,:64], r2
324
        vqmovn.s16      d5, q11
325
        vld1.64         {d24-d25}, [r0,:128]!
326
        vadd.u8         d3, d3, d31
327
        vld1.64         {d26-d27}, [r0,:128]!
328
        vadd.u8         d4, d4, d31
329
        vadd.u8         d5, d5, d31
330
        vst1.64         {d3},      [r1,:64], r2
331
        vqmovn.s16      d6, q12
332
        vst1.64         {d4},      [r1,:64], r2
333
        vqmovn.s16      d7, q13
334
        vst1.64         {d5},      [r1,:64], r2
335
        vadd.u8         d6, d6, d31
336
        vadd.u8         d7, d7, d31
337
        vst1.64         {d6},      [r1,:64], r2
338
        vst1.64         {d7},      [r1,:64], r2
339
        bx              lr
340
        .endfunc
341
342 08e12b22 Måns Rullgård
function ff_add_pixels_clamped_neon, export=1
343
        mov             r3, r1
344
        vld1.64         {d16},   [r1,:64], r2
345
        vld1.64         {d0-d1}, [r0,:128]!
346
        vaddw.u8        q0, q0, d16
347
        vld1.64         {d17},   [r1,:64], r2
348
        vld1.64         {d2-d3}, [r0,:128]!
349
        vqmovun.s16     d0, q0
350
        vld1.64         {d18},   [r1,:64], r2
351
        vaddw.u8        q1, q1, d17
352
        vld1.64         {d4-d5}, [r0,:128]!
353
        vaddw.u8        q2, q2, d18
354
        vst1.64         {d0},    [r3,:64], r2
355
        vqmovun.s16     d2, q1
356
        vld1.64         {d19},   [r1,:64], r2
357
        vld1.64         {d6-d7}, [r0,:128]!
358
        vaddw.u8        q3, q3, d19
359
        vqmovun.s16     d4, q2
360
        vst1.64         {d2},    [r3,:64], r2
361
        vld1.64         {d16},   [r1,:64], r2
362
        vqmovun.s16     d6, q3
363
        vld1.64         {d0-d1}, [r0,:128]!
364
        vaddw.u8        q0, q0, d16
365
        vst1.64         {d4},    [r3,:64], r2
366
        vld1.64         {d17},   [r1,:64], r2
367
        vld1.64         {d2-d3}, [r0,:128]!
368
        vaddw.u8        q1, q1, d17
369
        vst1.64         {d6},    [r3,:64], r2
370
        vqmovun.s16     d0, q0
371
        vld1.64         {d18},   [r1,:64], r2
372
        vld1.64         {d4-d5}, [r0,:128]!
373
        vaddw.u8        q2, q2, d18
374
        vst1.64         {d0},    [r3,:64], r2
375
        vqmovun.s16     d2, q1
376
        vld1.64         {d19},   [r1,:64], r2
377
        vqmovun.s16     d4, q2
378
        vld1.64         {d6-d7}, [r0,:128]!
379
        vaddw.u8        q3, q3, d19
380
        vst1.64         {d2},    [r3,:64], r2
381
        vqmovun.s16     d6, q3
382
        vst1.64         {d4},    [r3,:64], r2
383
        vst1.64         {d6},    [r3,:64], r2
384
        bx              lr
385
        .endfunc
386
387 f23740d9 Måns Rullgård
function ff_float_to_int16_neon, export=1
388
        subs            r2,  r2,  #8
389
        vld1.64         {d0-d1},  [r1,:128]!
390
        vcvt.s32.f32    q8,  q0,  #16
391
        vld1.64         {d2-d3},  [r1,:128]!
392
        vcvt.s32.f32    q9,  q1,  #16
393
        beq             3f
394
        bics            ip,  r2,  #15
395
        beq             2f
396
1:      subs            ip,  ip,  #16
397
        vshrn.s32       d4,  q8,  #16
398
        vld1.64         {d0-d1},  [r1,:128]!
399
        vcvt.s32.f32    q0,  q0,  #16
400
        vshrn.s32       d5,  q9,  #16
401
        vld1.64         {d2-d3},  [r1,:128]!
402
        vcvt.s32.f32    q1,  q1,  #16
403
        vshrn.s32       d6,  q0,  #16
404
        vst1.64         {d4-d5},  [r0,:128]!
405
        vshrn.s32       d7,  q1,  #16
406
        vld1.64         {d16-d17},[r1,:128]!
407
        vcvt.s32.f32    q8,  q8,  #16
408
        vld1.64         {d18-d19},[r1,:128]!
409
        vcvt.s32.f32    q9,  q9,  #16
410
        vst1.64         {d6-d7},  [r0,:128]!
411
        bne             1b
412
        ands            r2,  r2,  #15
413
        beq             3f
414
2:      vld1.64         {d0-d1},  [r1,:128]!
415
        vshrn.s32       d4,  q8,  #16
416
        vcvt.s32.f32    q0,  q0,  #16
417
        vld1.64         {d2-d3},  [r1,:128]!
418
        vshrn.s32       d5,  q9,  #16
419
        vcvt.s32.f32    q1,  q1,  #16
420
        vshrn.s32       d6,  q0,  #16
421
        vst1.64         {d4-d5},  [r0,:128]!
422
        vshrn.s32       d7,  q1,  #16
423
        vst1.64         {d6-d7},  [r0,:128]!
424
        bx              lr
425
3:      vshrn.s32       d4,  q8,  #16
426
        vshrn.s32       d5,  q9,  #16
427
        vst1.64         {d4-d5},  [r0,:128]!
428
        bx              lr
429
        .endfunc
430
431
function ff_float_to_int16_interleave_neon, export=1
432
        cmp             r3, #2
433
        ldrlt           r1, [r1]
434
        blt             ff_float_to_int16_neon
435
        bne             4f
436
437
        ldr             r3, [r1]
438
        ldr             r1, [r1, #4]
439
440
        subs            r2,  r2,  #8
441
        vld1.64         {d0-d1},  [r3,:128]!
442
        vcvt.s32.f32    q8,  q0,  #16
443
        vld1.64         {d2-d3},  [r3,:128]!
444
        vcvt.s32.f32    q9,  q1,  #16
445
        vld1.64         {d20-d21},[r1,:128]!
446
        vcvt.s32.f32    q10, q10, #16
447
        vld1.64         {d22-d23},[r1,:128]!
448
        vcvt.s32.f32    q11, q11, #16
449
        beq             3f
450
        bics            ip,  r2,  #15
451
        beq             2f
452
1:      subs            ip,  ip,  #16
453
        vld1.64         {d0-d1},  [r3,:128]!
454
        vcvt.s32.f32    q0,  q0,  #16
455
        vsri.32         q10, q8,  #16
456
        vld1.64         {d2-d3},  [r3,:128]!
457
        vcvt.s32.f32    q1,  q1,  #16
458
        vld1.64         {d24-d25},[r1,:128]!
459
        vcvt.s32.f32    q12, q12, #16
460
        vld1.64         {d26-d27},[r1,:128]!
461
        vsri.32         q11, q9,  #16
462
        vst1.64         {d20-d21},[r0,:128]!
463
        vcvt.s32.f32    q13, q13, #16
464
        vst1.64         {d22-d23},[r0,:128]!
465
        vsri.32         q12, q0,  #16
466
        vld1.64         {d16-d17},[r3,:128]!
467
        vsri.32         q13, q1,  #16
468
        vst1.64         {d24-d25},[r0,:128]!
469
        vcvt.s32.f32    q8,  q8,  #16
470
        vld1.64         {d18-d19},[r3,:128]!
471
        vcvt.s32.f32    q9,  q9,  #16
472
        vld1.64         {d20-d21},[r1,:128]!
473
        vcvt.s32.f32    q10, q10, #16
474
        vld1.64         {d22-d23},[r1,:128]!
475
        vcvt.s32.f32    q11, q11, #16
476
        vst1.64         {d26-d27},[r0,:128]!
477
        bne             1b
478
        ands            r2,  r2,  #15
479
        beq             3f
480
2:      vsri.32         q10, q8,  #16
481
        vld1.64         {d0-d1},  [r3,:128]!
482
        vcvt.s32.f32    q0,  q0,  #16
483
        vld1.64         {d2-d3},  [r3,:128]!
484
        vcvt.s32.f32    q1,  q1,  #16
485
        vld1.64         {d24-d25},[r1,:128]!
486
        vcvt.s32.f32    q12, q12, #16
487
        vsri.32         q11, q9,  #16
488
        vld1.64         {d26-d27},[r1,:128]!
489
        vcvt.s32.f32    q13, q13, #16
490
        vst1.64         {d20-d21},[r0,:128]!
491
        vsri.32         q12, q0,  #16
492
        vst1.64         {d22-d23},[r0,:128]!
493
        vsri.32         q13, q1,  #16
494
        vst1.64         {d24-d27},[r0,:128]!
495
        bx              lr
496
3:      vsri.32         q10, q8,  #16
497
        vsri.32         q11, q9,  #16
498
        vst1.64         {d20-d23},[r0,:128]!
499
        bx              lr
500
501
4:      push            {r4-r8,lr}
502
        cmp             r3,  #4
503
        lsl             ip,  r3,  #1
504
        blt             4f
505
506
        @ 4 channels
507
5:      ldmia           r1!, {r4-r7}
508
        mov             lr,  r2
509
        mov             r8,  r0
510
        vld1.64         {d16-d17},[r4,:128]!
511
        vcvt.s32.f32    q8,  q8,  #16
512
        vld1.64         {d18-d19},[r5,:128]!
513
        vcvt.s32.f32    q9,  q9,  #16
514
        vld1.64         {d20-d21},[r6,:128]!
515
        vcvt.s32.f32    q10, q10, #16
516
        vld1.64         {d22-d23},[r7,:128]!
517
        vcvt.s32.f32    q11, q11, #16
518
6:      subs            lr,  lr,  #8
519
        vld1.64         {d0-d1},  [r4,:128]!
520
        vcvt.s32.f32    q0,  q0,  #16
521
        vsri.32         q9,  q8,  #16
522
        vld1.64         {d2-d3},  [r5,:128]!
523
        vcvt.s32.f32    q1,  q1,  #16
524
        vsri.32         q11, q10, #16
525
        vld1.64         {d4-d5},  [r6,:128]!
526
        vcvt.s32.f32    q2,  q2,  #16
527
        vzip.32         d18, d22
528
        vld1.64         {d6-d7},  [r7,:128]!
529
        vcvt.s32.f32    q3,  q3,  #16
530
        vzip.32         d19, d23
531
        vst1.64         {d18},    [r8], ip
532
        vsri.32         q1,  q0,  #16
533
        vst1.64         {d22},    [r8], ip
534
        vsri.32         q3,  q2,  #16
535
        vst1.64         {d19},    [r8], ip
536
        vzip.32         d2,  d6
537
        vst1.64         {d23},    [r8], ip
538
        vzip.32         d3,  d7
539
        beq             7f
540
        vld1.64         {d16-d17},[r4,:128]!
541
        vcvt.s32.f32    q8,  q8,  #16
542
        vst1.64         {d2},     [r8], ip
543
        vld1.64         {d18-d19},[r5,:128]!
544
        vcvt.s32.f32    q9,  q9,  #16
545
        vst1.64         {d6},     [r8], ip
546
        vld1.64         {d20-d21},[r6,:128]!
547
        vcvt.s32.f32    q10, q10, #16
548
        vst1.64         {d3},     [r8], ip
549
        vld1.64         {d22-d23},[r7,:128]!
550
        vcvt.s32.f32    q11, q11, #16
551
        vst1.64         {d7},     [r8], ip
552
        b               6b
553
7:      vst1.64         {d2},     [r8], ip
554
        vst1.64         {d6},     [r8], ip
555
        vst1.64         {d3},     [r8], ip
556
        vst1.64         {d7},     [r8], ip
557
        subs            r3,  r3,  #4
558
        popeq           {r4-r8,pc}
559
        cmp             r3,  #4
560
        add             r0,  r0,  #8
561
        bge             5b
562
563
        @ 2 channels
564
4:      cmp             r3,  #2
565
        blt             4f
566
        ldmia           r1!, {r4-r5}
567
        mov             lr,  r2
568
        mov             r8,  r0
569
        tst             lr,  #8
570
        vld1.64         {d16-d17},[r4,:128]!
571
        vcvt.s32.f32    q8,  q8,  #16
572
        vld1.64         {d18-d19},[r5,:128]!
573
        vcvt.s32.f32    q9,  q9,  #16
574
        vld1.64         {d20-d21},[r4,:128]!
575
        vcvt.s32.f32    q10, q10, #16
576
        vld1.64         {d22-d23},[r5,:128]!
577
        vcvt.s32.f32    q11, q11, #16
578
        beq             6f
579
        subs            lr,  lr,  #8
580
        beq             7f
581
        vsri.32         d18, d16, #16
582
        vsri.32         d19, d17, #16
583
        vld1.64         {d16-d17},[r4,:128]!
584
        vcvt.s32.f32    q8,  q8,  #16
585
        vst1.32         {d18[0]}, [r8], ip
586
        vsri.32         d22, d20, #16
587
        vst1.32         {d18[1]}, [r8], ip
588
        vsri.32         d23, d21, #16
589
        vst1.32         {d19[0]}, [r8], ip
590
        vst1.32         {d19[1]}, [r8], ip
591
        vld1.64         {d18-d19},[r5,:128]!
592
        vcvt.s32.f32    q9,  q9,  #16
593
        vst1.32         {d22[0]}, [r8], ip
594
        vst1.32         {d22[1]}, [r8], ip
595
        vld1.64         {d20-d21},[r4,:128]!
596
        vcvt.s32.f32    q10, q10, #16
597
        vst1.32         {d23[0]}, [r8], ip
598
        vst1.32         {d23[1]}, [r8], ip
599
        vld1.64         {d22-d23},[r5,:128]!
600
        vcvt.s32.f32    q11, q11, #16
601
6:      subs            lr,  lr,  #16
602
        vld1.64         {d0-d1},  [r4,:128]!
603
        vcvt.s32.f32    q0,  q0,  #16
604
        vsri.32         d18, d16, #16
605
        vld1.64         {d2-d3},  [r5,:128]!
606
        vcvt.s32.f32    q1,  q1,  #16
607
        vsri.32         d19, d17, #16
608
        vld1.64         {d4-d5},  [r4,:128]!
609
        vcvt.s32.f32    q2,  q2,  #16
610
        vld1.64         {d6-d7},  [r5,:128]!
611
        vcvt.s32.f32    q3,  q3,  #16
612
        vst1.32         {d18[0]}, [r8], ip
613
        vsri.32         d22, d20, #16
614
        vst1.32         {d18[1]}, [r8], ip
615
        vsri.32         d23, d21, #16
616
        vst1.32         {d19[0]}, [r8], ip
617
        vsri.32         d2,  d0,  #16
618
        vst1.32         {d19[1]}, [r8], ip
619
        vsri.32         d3,  d1,  #16
620
        vst1.32         {d22[0]}, [r8], ip
621
        vsri.32         d6,  d4,  #16
622
        vst1.32         {d22[1]}, [r8], ip
623
        vsri.32         d7,  d5,  #16
624
        vst1.32         {d23[0]}, [r8], ip
625
        vst1.32         {d23[1]}, [r8], ip
626
        beq             6f
627
        vld1.64         {d16-d17},[r4,:128]!
628
        vcvt.s32.f32    q8,  q8,  #16
629
        vst1.32         {d2[0]},  [r8], ip
630
        vst1.32         {d2[1]},  [r8], ip
631
        vld1.64         {d18-d19},[r5,:128]!
632
        vcvt.s32.f32    q9,  q9,  #16
633
        vst1.32         {d3[0]},  [r8], ip
634
        vst1.32         {d3[1]},  [r8], ip
635
        vld1.64         {d20-d21},[r4,:128]!
636
        vcvt.s32.f32    q10, q10, #16
637
        vst1.32         {d6[0]},  [r8], ip
638
        vst1.32         {d6[1]},  [r8], ip
639
        vld1.64         {d22-d23},[r5,:128]!
640
        vcvt.s32.f32    q11, q11, #16
641
        vst1.32         {d7[0]},  [r8], ip
642
        vst1.32         {d7[1]},  [r8], ip
643
        bgt             6b
644
6:      vst1.32         {d2[0]},  [r8], ip
645
        vst1.32         {d2[1]},  [r8], ip
646
        vst1.32         {d3[0]},  [r8], ip
647
        vst1.32         {d3[1]},  [r8], ip
648
        vst1.32         {d6[0]},  [r8], ip
649
        vst1.32         {d6[1]},  [r8], ip
650
        vst1.32         {d7[0]},  [r8], ip
651
        vst1.32         {d7[1]},  [r8], ip
652
        b               8f
653
7:      vsri.32         d18, d16, #16
654
        vsri.32         d19, d17, #16
655
        vst1.32         {d18[0]}, [r8], ip
656
        vsri.32         d22, d20, #16
657
        vst1.32         {d18[1]}, [r8], ip
658
        vsri.32         d23, d21, #16
659
        vst1.32         {d19[0]}, [r8], ip
660
        vst1.32         {d19[1]}, [r8], ip
661
        vst1.32         {d22[0]}, [r8], ip
662
        vst1.32         {d22[1]}, [r8], ip
663
        vst1.32         {d23[0]}, [r8], ip
664
        vst1.32         {d23[1]}, [r8], ip
665
8:      subs            r3,  r3,  #2
666
        add             r0,  r0,  #4
667
        popeq           {r4-r8,pc}
668
669
        @ 1 channel
670
4:      ldr             r4,  [r1],#4
671
        tst             r2,  #8
672
        mov             lr,  r2
673
        mov             r5,  r0
674
        vld1.64         {d0-d1},  [r4,:128]!
675
        vcvt.s32.f32    q0,  q0,  #16
676
        vld1.64         {d2-d3},  [r4,:128]!
677
        vcvt.s32.f32    q1,  q1,  #16
678
        bne             8f
679
6:      subs            lr,  lr,  #16
680
        vld1.64         {d4-d5},  [r4,:128]!
681
        vcvt.s32.f32    q2,  q2,  #16
682
        vld1.64         {d6-d7},  [r4,:128]!
683
        vcvt.s32.f32    q3,  q3,  #16
684
        vst1.16         {d0[1]},  [r5,:16], ip
685
        vst1.16         {d0[3]},  [r5,:16], ip
686
        vst1.16         {d1[1]},  [r5,:16], ip
687
        vst1.16         {d1[3]},  [r5,:16], ip
688
        vst1.16         {d2[1]},  [r5,:16], ip
689
        vst1.16         {d2[3]},  [r5,:16], ip
690
        vst1.16         {d3[1]},  [r5,:16], ip
691
        vst1.16         {d3[3]},  [r5,:16], ip
692
        beq             7f
693
        vld1.64         {d0-d1},  [r4,:128]!
694
        vcvt.s32.f32    q0,  q0,  #16
695
        vld1.64         {d2-d3},  [r4,:128]!
696
        vcvt.s32.f32    q1,  q1,  #16
697
7:      vst1.16         {d4[1]},  [r5,:16], ip
698
        vst1.16         {d4[3]},  [r5,:16], ip
699
        vst1.16         {d5[1]},  [r5,:16], ip
700
        vst1.16         {d5[3]},  [r5,:16], ip
701
        vst1.16         {d6[1]},  [r5,:16], ip
702
        vst1.16         {d6[3]},  [r5,:16], ip
703
        vst1.16         {d7[1]},  [r5,:16], ip
704
        vst1.16         {d7[3]},  [r5,:16], ip
705
        bgt             6b
706
        pop             {r4-r8,pc}
707
8:      subs            lr,  lr,  #8
708
        vst1.16         {d0[1]},  [r5,:16], ip
709
        vst1.16         {d0[3]},  [r5,:16], ip
710
        vst1.16         {d1[1]},  [r5,:16], ip
711
        vst1.16         {d1[3]},  [r5,:16], ip
712
        vst1.16         {d2[1]},  [r5,:16], ip
713
        vst1.16         {d2[3]},  [r5,:16], ip
714
        vst1.16         {d3[1]},  [r5,:16], ip
715
        vst1.16         {d3[3]},  [r5,:16], ip
716
        popeq           {r4-r8,pc}
717
        vld1.64         {d0-d1},  [r4,:128]!
718
        vcvt.s32.f32    q0,  q0,  #16
719
        vld1.64         {d2-d3},  [r4,:128]!
720
        vcvt.s32.f32    q1,  q1,  #16
721
        b               6b
722
        .endfunc
723 dd927e2e Måns Rullgård
724
function ff_vector_fmul_neon, export=1
725
        mov             r3,  r0
726
        subs            r2,  r2,  #8
727
        vld1.64         {d0-d3},  [r0,:128]!
728
        vld1.64         {d4-d7},  [r1,:128]!
729
        vmul.f32        q8,  q0,  q2
730
        vmul.f32        q9,  q1,  q3
731
        beq             3f
732
        bics            ip,  r2,  #15
733
        beq             2f
734
1:      subs            ip,  ip,  #16
735
        vld1.64         {d0-d1},  [r0,:128]!
736
        vld1.64         {d4-d5},  [r1,:128]!
737
        vmul.f32        q10, q0,  q2
738
        vld1.64         {d2-d3},  [r0,:128]!
739
        vld1.64         {d6-d7},  [r1,:128]!
740
        vmul.f32        q11, q1,  q3
741
        vst1.64         {d16-d19},[r3,:128]!
742
        vld1.64         {d0-d1},  [r0,:128]!
743
        vld1.64         {d4-d5},  [r1,:128]!
744
        vmul.f32        q8,  q0,  q2
745
        vld1.64         {d2-d3},  [r0,:128]!
746
        vld1.64         {d6-d7},  [r1,:128]!
747
        vmul.f32        q9,  q1,  q3
748
        vst1.64         {d20-d23},[r3,:128]!
749
        bne             1b
750
        ands            r2,  r2,  #15
751
        beq             3f
752
2:      vld1.64         {d0-d1},  [r0,:128]!
753
        vld1.64         {d4-d5},  [r1,:128]!
754
        vst1.64         {d16-d17},[r3,:128]!
755
        vmul.f32        q8,  q0,  q2
756
        vld1.64         {d2-d3},  [r0,:128]!
757
        vld1.64         {d6-d7},  [r1,:128]!
758
        vst1.64         {d18-d19},[r3,:128]!
759
        vmul.f32        q9,  q1,  q3
760
3:      vst1.64         {d16-d19},[r3,:128]!
761
        bx              lr
762
        .endfunc
763 e1f7cb7f Måns Rullgård
764
@ Overlap-add windowing.  Register use implies the C signature
@   void ff_vector_fmul_window_neon(float *dst, const float *src0,
@                                   const float *src1, const float *win,
@                                   float add_bias, int len)
@ (TODO confirm against the dsputil prototype).  r0=dst, r1=src0,
@ r2=src1, r3=win; add_bias and len arrive per the float ABI selected
@ by the VFP/NOVFP macros (defined in asm.S).  The loop walks src0/win
@ forwards and src1/win-end backwards, producing 4 results at each end
@ of dst per iteration; len is assumed a multiple of 4.
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32         q8,  d0[0]              @ q8 = add_bias (float arg in d0[0])
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32] @ q8 = add_bias (from the stack)
        push            {r4,r5,lr}
VFP     ldr             lr,  [sp, #12]          @ lr = len (stack offsets adjusted
NOVFP   ldr             lr,  [sp, #16]          @ for the 12-byte push above)
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2    @ r2 -> last 16-byte chunk of src1
        add             r4,  r3,  r5, lsl #3    @ r4 -> last chunk of the 2*len window
        add             ip,  r0,  r5, lsl #3    @ ip -> last chunk of the 2*len dst
        mov             r5,  #-16               @ post-index step for the backward pointers
        vld1.64         {d0,d1},  [r1,:128]!    @ q0 = src0, forward
        vld1.64         {d2,d3},  [r2,:128], r5 @ q1 = src1, backward
        vld1.64         {d4,d5},  [r3,:128]!    @ q2 = win, forward
        vld1.64         {d6,d7},  [r4,:128], r5 @ q3 = win, backward
1:      subs            lr,  lr,  #4
        vmov            q11, q8                 @ both accumulators start at add_bias
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3                 @ element-reverse the backward window data
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1                 @ element-reverse the backward src1 data
        vmla.f32        d21, d1,  d6
        beq             2f                      @ last group: epilogue, no further loads
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!    @ loads for the next iteration are
        vmla.f32        d23, d2,  d6            @ interleaved with this iteration's MACs
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11                @ vrev64 + vswp = full 4-element reversal
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!    @ forward half of dst
        vst1.64         {d22,d23},[ip,:128], r5 @ backward half of dst
        b               1b
2:      vmla.f32        d22, d3,  d7            @ epilogue: finish the final group
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
        .endfunc
815 e814015d Måns Rullgård
816
#if CONFIG_VORBIS_DECODER
@ Vorbis inverse channel coupling, 4 floats per step.
@ r0 = mag array, r1 = ang array, r2 = blocksize (assumed multiple of 4).
@ Per element (working on raw IEEE-754 bit patterns):
@   a' = ang ^ (mag & 0x80000000)            (flip ang's sign by mag's sign)
@   if ang <= 0 (signed int compare): mag' = mag + a',  out_ang = mag
@   else:                             mag' = mag,       out_ang = mag - a'
@ Results are written back over both arrays.  NOTE(review): mapping to the
@ scalar ff_vorbis_inverse_coupling was inferred from the bit operations —
@ confirm against vorbis_dec.c.  The loop is software-pipelined: the main
@ loop stores the previous group's results while computing the next.
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31             @ q10 = sign-bit mask
        subs            r2,  r2,  #4
        mov             r3,  r0                 @ r3/r12 = write pointers for mag/ang
        mov             r12, r1
        beq             3f                      @ exactly 4 elements: single-group path
        vld1.32         {d24-d25},[r1,:128]!    @ q12 = ang
        vld1.32         {d22-d23},[r0,:128]!    @ q11 = mag
        vcle.s32        q8,  q12, #0            @ q8 = mask: ang <= 0
        vand            q9,  q11, q10           @ q9 = sign bit of mag
        veor            q12, q12, q9            @ q12 = ang with sign flipped by mag
        vand            q2,  q12, q8            @ adjusted ang where ang <= 0
        vbic            q3,  q12, q8            @ adjusted ang where ang > 0
        vadd.f32        q12, q11, q2            @ new values for the mag array
        vsub.f32        q11, q11, q3            @ new values for the ang array
1:      vld1.32         {d2-d3},  [r1,:128]!    @ next group in q0/q1 ...
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!   @ ... while storing the previous group
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f                      @ 0 or 4 elements left after this group
        vld1.32         {d24-d25},[r1,:128]!    @ same pattern, alternate register set
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b
2:      vst1.32         {d2-d3},  [r3, :128]!   @ flush the final pipelined group
        vst1.32         {d0-d1},  [r12,:128]!
        bxlt            lr                      @ count went negative: all done
@ Exactly 4 elements remain (or blocksize was 4): unpipelined single group.
3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
        .endfunc
#endif
877 1dee3e97 Måns Rullgård
878
@ dst[i] = src[i] * mul  for i in [0,len)
@ r0 = dst, r1 = src; mul/len arrive per the float ABI: with VFP args
@ mul is in s0 (d0[0]) and len in r2, otherwise mul is in r2 and len in
@ r3 (VFP/NOVFP macros from asm.S).  len assumed a positive multiple of 4.
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]      @ q8 = {mul, mul, mul, mul}
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15   @ r12 = len rounded down to a multiple of 16
        beq             3f              @ fewer than 16: simple 4-at-a-time loop only
        vld1.32         {q0},[r1,:128]! @ prime the 16-wide pipelined loop
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]! @ reload for the next iteration while
        vst1.32         {q2},[r0,:128]! @ draining this one's stores
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]! @ flush the last two quads
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15   @ leftover elements (multiple of 4)
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]! @ tail: 4 elements per iteration
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
        .endfunc
914
915
@ dst[i] = src[i] * sv[i/2][i%2] * mul
@ r0 = dst, r1 = src, r2 = table of float pointers, each entry addressing
@ 2 floats (8-byte aligned); mul in d0[0] (VFP args) or r3 (soft args);
@ len in r3 or on the stack accordingly.  len assumed a multiple of 4:
@ each iteration consumes 4 src floats and 2 pointer-table entries.
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]      @ d16 = {mul, mul}
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3,  [sp]       @ r3 = len
        vld1.32         {d0},[r1,:64]!  @ d0/d1 = 4 src elements
        vld1.32         {d1},[r1,:64]!
1:      subs            r3,  r3,  #4
        vmul.f32        d4,  d0,  d16   @ scale by mul first
        vmul.f32        d5,  d1,  d16
        ldr             r12, [r2], #4   @ fetch next sub-vector pointer
        vld1.32         {d2},[r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3},[r12,:64]
        vmul.f32        d4,  d4,  d2    @ then by the per-pair sub-vectors
        vmul.f32        d5,  d5,  d3
        beq             2f
        vld1.32         {d0},[r1,:64]!  @ preload next 4 src elements while
        vld1.32         {d1},[r1,:64]!  @ storing the current results
        vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        b               1b
2:      vst1.32         {d4},[r0,:64]!  @ final group
        vst1.32         {d5},[r0,:64]!
        bx              lr
        .endfunc
940
941
@ dst[i] = src[i] * sv[i/4][i%4] * mul
@ Like ff_vector_fmul_sv_scalar_2_neon but with 4-float sub-vectors:
@ r0 = dst, r1 = src, r2 = table of float pointers (each entry addresses
@ 4 floats, 16-byte aligned); mul in d0[0] (VFP args) or r3; len in r3
@ or on the stack.  len assumed a multiple of 4.
function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]      @ q10 = {mul, mul, mul, mul}
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3,  [sp]       @ r3 = len
        push            {lr}
        bics            lr,  r3,  #7    @ lr = len rounded down to a multiple of 8
        beq             3f              @ fewer than 8: 4-at-a-time loop only
        vld1.32         {q0},[r1,:128]! @ prime: 8 src elements
        vld1.32         {q2},[r1,:128]!
1:      ldr             r12, [r2], #4   @ two sub-vector pointers per iteration
        vld1.32         {q1},[r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3},[r12,:128]
        vmul.f32        q8,  q0,  q10   @ src * mul ...
        vmul.f32        q8,  q8,  q1    @ ... * sub-vector
        vmul.f32        q9,  q2,  q10
        vmul.f32        q9,  q9,  q3
        subs            lr,  lr,  #8
        beq             2f
        vld1.32         {q0},[r1,:128]! @ next 8 elements overlapped with stores
        vld1.32         {q2},[r1,:128]!
        vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        b               1b
2:      vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        ands            r3,  r3,  #7    @ leftover elements (0 or 4)
        popeq           {pc}
3:      vld1.32         {q0},[r1,:128]! @ tail: 4 elements, one pointer entry
        ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        vmul.f32        q0,  q0,  q10
        vmul.f32        q0,  q0,  q1
        vst1.32         {q0},[r0,:128]!
        subs            r3,  r3,  #4
        bgt             3b
        pop             {pc}
        .endfunc
979
980
@ dst[i] = sv[i/2][i%2] * mul
@ r0 = dst, r1 = table of float pointers (each entry addresses 2 floats,
@ 8-byte aligned); mul in d0[0] (VFP args) or r2; len in r2 or r3
@ accordingly.  len assumed a multiple of 4 (2 table entries / iteration).
function ff_sv_fmul_scalar_2_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]      @ q8 = {mul, mul, mul, mul}
NOVFP   vdup.32         q8,  r2
        ldr             r12, [r1], #4   @ gather two 2-float sub-vectors into q0
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
1:      vmul.f32        q1,  q0,  q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4   @ gather next pair while the previous
        vld1.32         {d0},[r12,:64]  @ result is still pending
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
        vst1.32         {q1},[r0,:128]!
        b               1b
2:      vst1.32         {q1},[r0,:128]! @ final group
        bx              lr
        .unreq          len
        .endfunc
1002
1003
@ dst[i] = sv[i/4][i%4] * mul
@ r0 = dst, r1 = table of float pointers (each entry addresses 4 floats,
@ 16-byte aligned); mul in d0[0] (VFP args) or r2; len in r2 or r3.
@ len assumed a positive multiple of 4 (one table entry per iteration).
function ff_sv_fmul_scalar_4_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]      @ q8 = {mul, mul, mul, mul}
NOVFP   vdup.32         q8,  r2
1:      ldr             r12, [r1], #4   @ r12 = next sub-vector pointer
        vld1.32         {q0},[r12,:128]
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
        .endfunc
1017
1018
@ Butterfly operation on two float arrays, in place:
@   t = v1[i] - v2[i];  v1[i] += v2[i];  v2[i] = t
@ r0 = v1, r1 = v2, r2 = len (assumed positive multiple of 4);
@ both arrays 16-byte aligned.
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1    @ q2 = v1 - v2
        vadd.f32        q1,  q0,  q1    @ q1 = v1 + v2
        vst1.32         {q2},[r1,:128]! @ v2 = difference
        vst1.32         {q1},[r0,:128]! @ v1 = sum
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
        .endfunc
1029 275cfd15 Måns Rullgård
1030
@ float scalarproduct_float(const float *v1, const float *v2, int len)
@ Returns sum(v1[i] * v2[i]).  r0 = v1, r1 = v2, r2 = len (assumed a
@ positive multiple of 4); the result is left in d0/s0, and additionally
@ copied to r0 for the soft-float return convention (NOVFP).
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0       @ four partial accumulators
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1    @ q2 += v1 * v2 (elementwise)
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5    @ horizontal reduction of the 4 lanes
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]      @ soft-float ABI returns in r0
        bx              lr
        .endfunc
1042 9bda7f30 Måns Rullgård
1043
@ dst[i] = (float)src[i] * mul
@ r0 = float *dst, r1 = const int32 *src; mul in d0[0] (VFP args) or r2,
@ len in r2 or r3 accordingly.  len assumed a positive multiple of 8;
@ both buffers 16-byte aligned.  Pipelined: converts the next 8 values
@ while storing the previous 8 products.
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32         q0,  d0[0]      @ q0 = {mul, mul, mul, mul}
VFP     len     .req    r2
NOVFP   vdup.32         q0,  r2
NOVFP   len     .req    r3
        vld1.32         {q1},[r1,:128]! @ prime: load + int->float convert 8 values
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
1:      subs            len, len, #8
        pld             [r1, #16]       @ prefetch upcoming source data
        vmul.f32        q9,  q3,  q0
        vmul.f32        q10, q8,  q0
        beq             2f
        vld1.32         {q1},[r1,:128]! @ convert next batch while storing this one
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2},[r1,:128]!
        vcvt.f32.s32    q8,  q2
        vst1.32         {q9}, [r0,:128]!
        vst1.32         {q10},[r0,:128]!
        b               1b
2:      vst1.32         {q9}, [r0,:128]! @ flush final products
        vst1.32         {q10},[r0,:128]!
        bx              lr
        .unreq  len
        .endfunc
1070 b9b1ad9c Måns Rullgård
1071
@ dst[i] = src0[i] * src1[len - 1 - i]
@ r0 = dst, r1 = src0, r2 = src1, r3 = len (assumed a positive multiple
@ of 8); all buffers 16-byte aligned.  src1 is walked backwards 32 bytes
@ at a time; vrev64 plus crossed d-register operands (d7..d4) perform the
@ full 8-element reversal.
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2   @ r2 = src1 + len (floats)
        sub             r2,  r2,  #32           @ point at the last 8 floats
        mov             r12, #-32               @ backward post-index step
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3                 @ swap within each d register ...
        vmul.f32        d16, d0,  d7            @ ... and cross d7/d6, d5/d4 to
        vmul.f32        d17, d1,  d6            @ complete the reversal
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!    @ next 8 elements while storing
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!    @ final 8 products
        bx              lr
        .endfunc
1094 f331cec4 Måns Rullgård
1095 ec71a8e0 Måns Rullgård
@ dst[i] = src0[i] * src1[i] + src2[i]
@ r0 = dst, r1 = src0, r2 = src1, r3 = src2, len on the stack ([sp]).
@ len assumed a positive multiple of 8; all buffers 16-byte aligned.
@ Pipelined: multiplies for the next group overlap stores of this group.
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]       @ r12 = len
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8    @ first 8 products
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10   @ add the addend vectors
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!    @ start next group's multiplies
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!    @ store previous group's results
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!    @ final 8 results
        bx              lr
        .endfunc
1121
1122 f331cec4 Måns Rullgård
@ dst[i] = clip(src[i], min, max)  implemented as max(min(src, max), min)
@ r0 = dst, r1 = src; with VFP float args min is in s0 (d0[0]) and max in
@ s1 (d0[1]) with len in r2; otherwise min = r2, max = r3, len = [sp].
@ len assumed a positive multiple of 8; buffers 16-byte aligned.
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]      @ q1 = max (read d0[1] before the next
VFP     vdup.32         q0,  d0[0]      @ vdup overwrites q0 = d0/d1); q0 = min
NOVFP   vdup.32         q0,  r2         @ q0 = min
NOVFP   vdup.32         q1,  r3         @ q1 = max
NOVFP   ldr             r2,  [sp]       @ r2 = len
        vld1.f32        {q2},[r1,:128]! @ prime: clamp first 8 against max
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0    @ finish the clamp against min
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]! @ next 8 elements while previous
        vmin.f32        q10, q2,  q1    @ results are stored below
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]! @ final 8 clipped values
        vst1.f32        {q9},[r0,:128]!
        bx              lr
        .endfunc