/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text

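@ ff_clear_block_neon: zero one block of 64 16-bit coefficients (128 bytes).
@ r0 = block pointer, 16-byte aligned (dsputil clear_block()).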
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

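@ ff_clear_blocks_neon: zero six consecutive 64-coefficient blocks
@ (768 bytes), i.e. one macroblock's worth. r0 = first block.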
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

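@ The pixels16* and pixels8* macros below implement the dsputil put/avg
@ pixel copy primitives.  Register use (assumed from the op_pixels_func
@ prototype): r0 = dst, r1 = src, r2 = line stride in bytes, r3 = height.
@ The plain macros copy (or, with avg=1, average into dst); the _x2, _y2
@ and _xy2 variants produce the horizontal, vertical and diagonal half-pel
@ interpolations, with \vhadd/\vshrn selecting rounding (vrhadd/vrshrn) or
@ truncating (no_rnd) behaviour.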
        .macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

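@ pixels16_x2: average each pixel with its right-hand neighbour
@ (horizontal half-pel), two rows per iteration.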
        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

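@ pixels16_y2: average each pixel with the pixel directly below it
@ (vertical half-pel), two rows per iteration.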
        .macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

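@ pixels16_xy2: 2x2 half-pel average.  Four neighbouring pixels are summed
@ into 16-bit lanes and narrowed with \vshrn by 2; with no_rnd=1 a bias of
@ 1 is added and the truncating vshrn replaces the rounding vrshrn.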
        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
        .endm

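@ pixels8, pixels8_x2, pixels8_y2 and pixels8_xy2 are the 8-pixel-wide
@ counterparts of the macros above, using 64-bit d registers but the same
@ register convention (r0 = dst, r1 = src, r2 = stride, r3 = height).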
        .macro pixels8 avg=0
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0},      [r1], r2
        vld1.64         {d1},      [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0},      [r1], r2
        \vhadd          d5,  d0,  d1
        vld1.64         {d1},      [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4},      [r0,:64], r2
        vst1.64         {d5},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5},      [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},      [r0,:64], r2
        bgt             1b
        bx              lr
        .endm

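@ pixfunc/pixfunc2 instantiate exported wrappers around the macros above:
@ pixfunc emits a single ff_<pfx><name><suf>_neon function, pixfunc2 emits
@ both the rounding and the _no_rnd variant.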
        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
endfunc
        .endm

        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm

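@ The h264 qpel mc00 cases are plain pixel copies: each stub below only
@ loads the block height into r3 and then falls through into the put/avg
@ pixels function generated right after it (function/endfunc emit no code
@ of their own, so the fall-through is intentional).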
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc  avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc  avg_ pixels8,, 1

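@ ff_put_pixels_clamped_neon: convert 64 signed 16-bit coefficients to
@ unsigned 8-bit pixels with saturation (vqmovun) and store them as an
@ 8x8 block.  r0 = coefficients, r1 = dst, r2 = dst stride (per the
@ dsputil put_pixels_clamped prototype).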
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4},      [r1,:64], r2
        vst1.64         {d5},      [r1,:64], r2
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

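@ ff_put_signed_pixels_clamped_neon: as above, but the coefficients are
@ narrowed with signed saturation (vqmovn) and offset by 128 before being
@ stored.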
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
endfunc

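@ ff_add_pixels_clamped_neon: add 64 16-bit coefficients (r0) to an 8x8
@ block of pixels (r1, stride r2) and store the result back with unsigned
@ saturation.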
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
endfunc

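@ ff_vector_fmul_neon: element-wise single-precision multiply of two float
@ vectors: r0 = dst, r1/r2 = sources, r3 = length (a multiple of 8).  The
@ main loop handles 16 elements per iteration, with a shorter tail path.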
function ff_vector_fmul_neon, export=1
        subs            r3,  r3,  #8
        vld1.64         {d0-d3},  [r1,:128]!
        vld1.64         {d4-d7},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r3,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r0,:128]!
        vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r0,:128]!
        bne             1b
        ands            r3,  r3,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vld1.64         {d4-d5},  [r2,:128]!
        vst1.64         {d16-d17},[r0,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r1,:128]!
        vld1.64         {d6-d7},  [r2,:128]!
        vst1.64         {d18-d19},[r0,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r0,:128]!
        bx              lr
endfunc

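@ ff_vector_fmul_window_neon: MDCT overlap-add windowing as in the C
@ vector_fmul_window(): r0 = dst, r1 = src0, r2 = src1, r3 = window,
@ [sp] = length (assumed argument order).  src1 and the second half of the
@ window are read in reverse, and results are written from both ends of
@ dst towards the middle.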
function ff_vector_fmul_window_neon, export=1
        push            {r4,r5,lr}
        ldr             lr,  [sp, #12]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmul.f32        d22, d0,  d4
        vrev64.32       q3,  q3
        vmul.f32        d23, d1,  d5
        vrev64.32       q1,  q1
        vmul.f32        d20, d0,  d7
        vmul.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
endfunc

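@ ff_vorbis_inverse_coupling_neon: Vorbis inverse channel coupling,
@ recombining the magnitude (r0) and angle (r1) vectors in place;
@ r2 = length.  Built only when the Vorbis decoder is enabled.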
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},  [r1,:128]!
        vld1.32         {d0-d1},  [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25},[r3, :128]!
        vst1.32         {d22-d23},[r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25},[r1,:128]!
        vld1.32         {d22-d23},[r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},  [r3, :128]!
        vst1.32         {d0-d1},  [r12,:128]!
        bxlt            lr

3:      vld1.32         {d2-d3},  [r1,:128]
        vld1.32         {d0-d1},  [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},  [r0,:128]!
        vst1.32         {d0-d1},  [r1,:128]!
        bx              lr
endfunc
#endif

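@ ff_vector_fmul_scalar_neon: dst[i] = src[i] * scalar.  The VFP/NOVFP
@ prefixes select hard-float (scalar in s0, length in r2) or soft-float
@ (scalar in r2, length in r3) argument passing; r0 = dst, r1 = src.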
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q1},[r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2},[r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3},[r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0},[r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1},[r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vst1.32         {q2},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vst1.32         {q3},[r0,:128]!
        b               1b
2:      vst1.32         {q2},[r0,:128]!
        vst1.32         {q3},[r0,:128]!
        ands            len, len, #15
        bxeq            lr
3:      vld1.32         {q0},[r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

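@ ff_vector_fmul_sv_scalar_{2,4}_neon: multiply src (r1) by a scalar and
@ by short vectors gathered through an array of pointers (r2), writing to
@ dst (r0).  The _2 variant gathers 2-element vectors, the _4 variant
@ 4-element ones; length is in r3 (hard-float) or on the stack (soft-float).
@ Exact C prototypes are assumed from the matching dsputil entries.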
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3,  [sp]
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
1:      subs            r3,  r3,  #4
        vmul.f32        d4,  d0,  d16
        vmul.f32        d5,  d1,  d16
        ldr             r12, [r2], #4
        vld1.32         {d2},[r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3},[r12,:64]
        vmul.f32        d4,  d4,  d2
        vmul.f32        d5,  d5,  d3
        beq             2f
        vld1.32         {d0},[r1,:64]!
        vld1.32         {d1},[r1,:64]!
        vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        b               1b
2:      vst1.32         {d4},[r0,:64]!
        vst1.32         {d5},[r0,:64]!
        bx              lr
endfunc

function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3,  [sp]
        push            {lr}
        bics            lr,  r3,  #7
        beq             3f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
1:      ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3},[r12,:128]
        vmul.f32        q8,  q0,  q10
        vmul.f32        q8,  q8,  q1
        vmul.f32        q9,  q2,  q10
        vmul.f32        q9,  q9,  q3
        subs            lr,  lr,  #8
        beq             2f
        vld1.32         {q0},[r1,:128]!
        vld1.32         {q2},[r1,:128]!
        vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        b               1b
2:      vst1.32         {q8},[r0,:128]!
        vst1.32         {q9},[r0,:128]!
        ands            r3,  r3,  #7
        popeq           {pc}
3:      vld1.32         {q0},[r1,:128]!
        ldr             r12, [r2], #4
        vld1.32         {q1},[r12,:128]
        vmul.f32        q0,  q0,  q10
        vmul.f32        q0,  q0,  q1
        vst1.32         {q0},[r0,:128]!
        subs            r3,  r3,  #4
        bgt             3b
        pop             {pc}
endfunc

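@ ff_sv_fmul_scalar_{2,4}_neon: like the functions above but with no
@ contiguous src operand; the input is gathered entirely through the
@ pointer array in r1 and scaled by the scalar.  r0 = dst, length in r2
@ (hard-float) or r3 (soft-float).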
function ff_sv_fmul_scalar_2_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
1:      vmul.f32        q1,  q0,  q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4
        vld1.32         {d0},[r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1},[r12,:64]
        vst1.32         {q1},[r0,:128]!
        b               1b
2:      vst1.32         {q1},[r0,:128]!
        bx              lr
        .unreq          len
endfunc

function ff_sv_fmul_scalar_4_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
1:      ldr             r12, [r1], #4
        vld1.32         {q0},[r12,:128]
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0},[r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
endfunc

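@ ff_butterflies_float_neon: in-place butterfly over two float vectors,
@ v1[i] += v2[i] and v2[i] = old v1[i] - v2[i]; r0 = v1, r1 = v2,
@ r2 = length.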
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0},[r0,:128]
        vld1.32         {q1},[r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2},[r1,:128]!
        vst1.32         {q1},[r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

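@ ff_scalarproduct_float_neon: dot product of two float vectors (r0, r1)
@ of length r2; the result is left in s0 (hard-float) or moved to r0 for
@ the soft-float ABI (the NOVFP line).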
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0},[r0,:128]!
        vld1.32         {q1},[r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

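@ ff_vector_fmul_reverse_neon: dst[i] = src0[i] * src1[len-1-i];
@ r0 = dst, r1 = src0, r2 = src1, r3 = length (a multiple of 8).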
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q2-q3},  [r2,:128], r12
        vst1.32         {q8-q9},  [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9},  [r0,:128]!
        bx              lr
endfunc

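@ ff_vector_fmul_add_neon: dst[i] = src0[i] * src1[i] + src2[i];
@ r0 = dst, r1 = src0, r2 = src1, r3 = src2, [sp] = length (multiple of 8).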
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},  [r1,:128]!
        vld1.32         {q8-q9},  [r2,:128]!
        vld1.32         {q2-q3},  [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0},     [r1,:128]!
        vld1.32         {q8},     [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1},     [r1,:128]!
        vld1.32         {q9},     [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},  [r3,:128]!
        vst1.32         {q12-q13},[r0,:128]!
        b               1b
2:      vst1.32         {q12-q13},[r0,:128]!
        bx              lr
endfunc

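@ ff_vector_clipf_neon: clamp each element of src (r1) to [min, max] and
@ store to dst (r0).  Hard-float: min/max in s0/s1, length in r2;
@ soft-float: min in r2, max in r3, length on the stack.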
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc