ffmpeg/libavcodec/arm/dsputil_neon_s.S @ e1f7cb7f

/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon
        .text

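@ pixels16: copy four 16-byte rows per loop iteration from src to dst;
@ with avg=1 each row is first averaged (vrhadd.u8) with the data already
@ in dst.  Registers follow the dsputil put/avg_pixels prototype:
@ r0 = block (dst), r1 = pixels (src), r2 = line_size, r3 = h.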
        .macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

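@ pixels16_x2: horizontal half-pel interpolation, averaging each pixel
@ with its right-hand neighbour.  The default vrhadd.u8 rounds up; the
@ _no_rnd variants pass vhadd.u8 to truncate instead.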
        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm

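@ pixels16_y2: vertical half-pel interpolation, averaging each row with
@ the row below it; two output rows are produced per loop iteration.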
        .macro pixels16_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1],      lr
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [ip],      lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        pop             {pc}
        .endm

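@ pixels16_xy2: 2D half-pel interpolation.  The rounded form computes
@ (a+b+c+d+2)>>2 via vrshrn.u16; with no_rnd=1 a bias of 1 is added and
@ the truncating vshrn.u16 gives (a+b+c+d+1)>>2.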
        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0-d2},   [r1], lr
        vld1.64         {d4-d6},   [ip], lr
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], lr
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1 , q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [ip], lr
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        pop             {pc}
        .endm

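@ pixels8 and its _x2/_y2/_xy2 variants below mirror the 16-pixel macros
@ above, but operate on 8-byte rows (one d register per row).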
        .macro pixels8
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm

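@ pixels8_x2: 8-wide horizontal half-pel average of each pixel and its
@ right-hand neighbour; \vhadd selects rounding or truncation.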
        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm

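@ pixels8_y2: 8-wide vertical half-pel average of adjacent rows.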
        .macro pixels8_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0},      [r1], lr
        vld1.64         {d1},      [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0},      [r1],     lr
        \vhadd          d5,  d0,  d1
        vld1.64         {d1},      [ip],     lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4},      [r0,:64], r2
        vst1.64         {d5},      [r0,:64], r2
        bne             1b
        pop             {pc}
        .endm

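@ pixels8_xy2: 8-wide 2D half-pel average of the four neighbouring pixels,
@ with the same rounding/no_rnd handling as pixels16_xy2.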
        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], lr
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [ip], lr
        vadd.u16        q10, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5},      [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},      [r0,:64], r2
        bgt             1b
        pop             {pc}
        .endm

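@ pixfunc expands one of the macros above into an exported
@ ff_<pfx><name><suf>_neon function; pixfunc2 emits both the default
@ (rounding) version and the _no_rnd variant.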
        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
        .endfunc
        .endm

        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm

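@ The h264 qpel*_mc00 cases are plain copies with a fixed height, so these
@ stubs only load h into r3 and fall through into the pixels function
@ generated immediately below them (hence no return before .endfunc).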
function ff_put_h264_qpel16_mc00_neon, export=1
        mov   r3, #16
        .endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov   r3, #16
        .endfunc

        pixfunc  avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov   r3, #8
        .endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1

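@ ff_float_to_int16_neon: convert floats to signed 16-bit integers using a
@ fixed-point vcvt with 16 fraction bits followed by a narrowing shift,
@ 8 samples per step.  Per the float_to_int16 DSP prototype, r0 = dst,
@ r1 = src, r2 = len (assumed to be a multiple of 8).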
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
        .endfunc

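@ ff_float_to_int16_interleave_neon: interleaved conversion of several
@ channels (r0 = dst, r1 = array of channel pointers, r2 = len,
@ r3 = channels).  One channel tail-calls ff_float_to_int16_neon, two
@ channels use the vsri.32 merge path below, and more channels are
@ handled in groups of 4, then 2, then 1.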
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        ldrlt           r1, [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3, [r1]
        ldr             r1, [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr

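@ Generic case (more than 2 channels): ip = 2*channels is the byte stride
@ between successive samples of one channel in the interleaved s16 output.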
4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18},    [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22},    [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19},    [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23},    [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},     [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},     [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},     [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},     [r8], ip
        b               6b
7:      vst1.64         {d2},     [r8], ip
        vst1.64         {d6},     [r8], ip
        vst1.64         {d3},     [r8], ip
        vst1.64         {d7},     [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

        @ 2 channels
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]}, [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]}, [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]}, [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
        beq             6f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vld1.64         {d20-d21},[r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vld1.64         {d22-d23},[r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},  [r8], ip
        vst1.32         {d2[1]},  [r8], ip
        vst1.32         {d3[0]},  [r8], ip
        vst1.32         {d3[1]},  [r8], ip
        vst1.32         {d6[0]},  [r8], ip
        vst1.32         {d6[1]},  [r8], ip
        vst1.32         {d7[0]},  [r8], ip
        vst1.32         {d7[1]},  [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]}, [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]}, [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]}, [r8], ip
        vst1.32         {d19[1]}, [r8], ip
        vst1.32         {d22[0]}, [r8], ip
        vst1.32         {d22[1]}, [r8], ip
        vst1.32         {d23[0]}, [r8], ip
        vst1.32         {d23[1]}, [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4,  [r1],#4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
        .endfunc

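@ ff_vector_fmul_neon: element-wise dst *= src (r0 = dst, r1 = src,
@ r2 = len), processing 8 floats per step with the multiplies interleaved
@ with the next loads.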
function ff_vector_fmul_neon, export=1
        mov             r3,  r0
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},  [r0,:128]!
        vld1.64         {d4-d7},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r3,:128]!
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r3,:128]!
        bx              lr
        .endfunc

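@ ff_vector_fmul_window_neon: windowed overlap of two inputs
@ (r0 = dst, r1 = src0, r2 = src1, r3 = win; add_bias and len are passed
@ on the stack).  The bias is broadcast into q8, and the two halves of the
@ output are written forwards from dst and backwards from near its end.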
function ff_vector_fmul_window_neon, export=1
        vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
        ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmov            q11, q8
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
        .endfunc