ffmpeg / libavcodec / arm / dsputil_neon_s.S @ 0a8958c8

/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon
        .text
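
@ pixels16: copy a 16-pixel-wide block, four rows per iteration; with
@ avg=1, average the source into the destination instead (vrhadd.u8).
@ Arguments follow dsputil's op_pixels_func convention: r0 = dst,
@ r1 = src, r2 = line size, r3 = height (a multiple of 4).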
        .macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm
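
@ pixels16_x2: horizontal half-pel interpolation; averages each pixel
@ with its right-hand neighbour.  \vhadd selects rounding (vrhadd.u8)
@ or truncating (vhadd.u8, the _no_rnd variant) averaging.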
        .macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm
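
@ pixels16_y2: vertical half-pel interpolation; averages each row with
@ the one below it, two rows per iteration.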
        .macro pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
        .endm
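
@ pixels16_xy2: 2D half-pel interpolation; each output pixel averages a
@ 2x2 source quad.  The rounding variant relies on vrshrn.u16 for
@ (a+b+c+d+2)>>2; the _no_rnd variant adds 1 and truncates with
@ vshrn.u16, giving (a+b+c+d+1)>>2.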
        .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
        .endm
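
@ pixels8: as pixels16, but for an 8-pixel-wide block.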
        .macro pixels8
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
        .endm
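
@ pixels8_x2: 8-wide horizontal half-pel interpolation.  The vswp packs
@ two rows into each q register so one \vhadd averages both rows.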
        .macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm
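
@ pixels8_y2: 8-wide vertical half-pel interpolation.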
        .macro pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0},      [r1], r2
        vld1.64         {d1},      [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0},      [r1], r2
        \vhadd          d5,  d0,  d1
        vld1.64         {d1},      [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4},      [r0,:64], r2
        vst1.64         {d5},      [r0,:64], r2
        bne             1b
        bx              lr
        .endm
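
@ pixels8_xy2: 8-wide 2D half-pel interpolation; same averaging scheme
@ as pixels16_xy2.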
        .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5},      [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7},      [r0,:64], r2
        bgt             1b
        bx              lr
        .endm
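
@ pixfunc: instantiate one of the macros above as the exported function
@ ff_<pfx><name><suf>_neon.  pixfunc2 emits both the rounding and the
@ _no_rnd variant.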
        .macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
        .endfunc
        .endm

        .macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
        .endm
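
@ The h264 qpel mc00 (full-pel) cases are plain copies: each stub below
@ only sets the height in r3 and then falls through into the pixels
@ function that pixfunc expands immediately after it.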
function ff_put_h264_qpel16_mc00_neon, export=1
        mov   r3, #16
        .endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov   r3, #16
        .endfunc

        pixfunc  avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov   r3, #8
        .endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,   _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2,  _no_rnd, vshrn.u16, 1
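
@ ff_put_pixels_clamped_neon: convert an 8x8 block of signed 16-bit
@ coefficients (r0) to unsigned 8-bit pixels with saturation (vqmovun)
@ and store them at r1 with line size r2.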
function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4},      [r1,:64], r2
        vst1.64         {d5},      [r1,:64], r2
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
        .endfunc
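
@ ff_put_signed_pixels_clamped_neon: as above, but saturate to signed
@ 8-bit and add 128, mapping the signed range onto unsigned pixels.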
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6},      [r1,:64], r2
        vst1.64         {d7},      [r1,:64], r2
        bx              lr
        .endfunc
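
@ ff_add_pixels_clamped_neon: add an 8x8 block of signed 16-bit
@ coefficients (r0) to the pixels at r1 (line size r2), saturating the
@ result to 8 bits.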
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
        .endfunc
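
@ ff_float_to_int16_neon: convert floats (r1), prescaled to the 16-bit
@ sample range, to int16 (r0); r2 = sample count, a multiple of 8.  The
@ fixed-point vcvt saturates in 32 bits and the narrowing shift returns
@ the clipped 16-bit value.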
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},  [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17},[r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},  [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},  [r0,:128]!
        bx              lr
        .endfunc
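
@ ff_float_to_int16_interleave_neon: like ff_float_to_int16_neon, but
@ interleaving r3 channels from the pointer array at r1 into r0.  Mono
@ tail-calls the plain version; stereo gets the fast path below; more
@ channels fall through to the general case at 4:.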
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3, #2
        ldrlt           r1, [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3, [r1]
        ldr             r1, [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27},[r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21},[r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17},[r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25},[r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27},[r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},  [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25},[r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27},[r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21},[r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23},[r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27},[r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23},[r0,:128]!
        bx              lr
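
@ General case: more than two channels.  ip is the distance in bytes
@ between consecutive samples of one channel in the interleaved output
@ (2 * channels); channels are processed in groups of 4, then 2, then 1.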
4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},  [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},  [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},  [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18},    [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22},    [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19},    [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23},    [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17},[r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},     [r8], ip
        vld1.64         {d18-d19},[r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},     [r8], ip
        vld1.64         {d20-d21},[r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},     [r8], ip
        vld1.64         {d22-d23},[r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},     [r8], ip
        b               6b
7:      vst1.64         {d2},     [r8], ip
        vst1.64         {d6},     [r8], ip
        vst1.64         {d3},     [r8], ip
        vst1.64         {d7},     [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

547
4:      cmp             r3,  #2
548
        blt             4f
549
        ldmia           r1!, {r4-r5}
550
        mov             lr,  r2
551
        mov             r8,  r0
552
        tst             lr,  #8
553
        vld1.64         {d16-d17},[r4,:128]!
554
        vcvt.s32.f32    q8,  q8,  #16
555
        vld1.64         {d18-d19},[r5,:128]!
556
        vcvt.s32.f32    q9,  q9,  #16
557
        vld1.64         {d20-d21},[r4,:128]!
558
        vcvt.s32.f32    q10, q10, #16
559
        vld1.64         {d22-d23},[r5,:128]!
560
        vcvt.s32.f32    q11, q11, #16
561
        beq             6f
562
        subs            lr,  lr,  #8
563
        beq             7f
564
        vsri.32         d18, d16, #16
565
        vsri.32         d19, d17, #16
566
        vld1.64         {d16-d17},[r4,:128]!
567
        vcvt.s32.f32    q8,  q8,  #16
568
        vst1.32         {d18[0]}, [r8], ip
569
        vsri.32         d22, d20, #16
570
        vst1.32         {d18[1]}, [r8], ip
571
        vsri.32         d23, d21, #16
572
        vst1.32         {d19[0]}, [r8], ip
573
        vst1.32         {d19[1]}, [r8], ip
574
        vld1.64         {d18-d19},[r5,:128]!
575
        vcvt.s32.f32    q9,  q9,  #16
576
        vst1.32         {d22[0]}, [r8], ip
577
        vst1.32         {d22[1]}, [r8], ip
578
        vld1.64         {d20-d21},[r4,:128]!
579
        vcvt.s32.f32    q10, q10, #16
580
        vst1.32         {d23[0]}, [r8], ip
581
        vst1.32         {d23[1]}, [r8], ip
582
        vld1.64         {d22-d23},[r5,:128]!
583
        vcvt.s32.f32    q11, q11, #16
584
6:      subs            lr,  lr,  #16
585
        vld1.64         {d0-d1},  [r4,:128]!
586
        vcvt.s32.f32    q0,  q0,  #16
587
        vsri.32         d18, d16, #16
588
        vld1.64         {d2-d3},  [r5,:128]!
589
        vcvt.s32.f32    q1,  q1,  #16
590
        vsri.32         d19, d17, #16
591
        vld1.64         {d4-d5},  [r4,:128]!
592
        vcvt.s32.f32    q2,  q2,  #16
593
        vld1.64         {d6-d7},  [r5,:128]!
594
        vcvt.s32.f32    q3,  q3,  #16
595
        vst1.32         {d18[0]}, [r8], ip
596
        vsri.32         d22, d20, #16
597
        vst1.32         {d18[1]}, [r8], ip
598
        vsri.32         d23, d21, #16
599
        vst1.32         {d19[0]}, [r8], ip
600
        vsri.32         d2,  d0,  #16
601
        vst1.32         {d19[1]}, [r8], ip
602
        vsri.32         d3,  d1,  #16
603
        vst1.32         {d22[0]}, [r8], ip
604
        vsri.32         d6,  d4,  #16
605
        vst1.32         {d22[1]}, [r8], ip
606
        vsri.32         d7,  d5,  #16
607
        vst1.32         {d23[0]}, [r8], ip
608
        vst1.32         {d23[1]}, [r8], ip
609
        beq             6f
610
        vld1.64         {d16-d17},[r4,:128]!
611
        vcvt.s32.f32    q8,  q8,  #16
612
        vst1.32         {d2[0]},  [r8], ip
613
        vst1.32         {d2[1]},  [r8], ip
614
        vld1.64         {d18-d19},[r5,:128]!
615
        vcvt.s32.f32    q9,  q9,  #16
616
        vst1.32         {d3[0]},  [r8], ip
617
        vst1.32         {d3[1]},  [r8], ip
618
        vld1.64         {d20-d21},[r4,:128]!
619
        vcvt.s32.f32    q10, q10, #16
620
        vst1.32         {d6[0]},  [r8], ip
621
        vst1.32         {d6[1]},  [r8], ip
622
        vld1.64         {d22-d23},[r5,:128]!
623
        vcvt.s32.f32    q11, q11, #16
624
        vst1.32         {d7[0]},  [r8], ip
625
        vst1.32         {d7[1]},  [r8], ip
626
        bgt             6b
627
6:      vst1.32         {d2[0]},  [r8], ip
628
        vst1.32         {d2[1]},  [r8], ip
629
        vst1.32         {d3[0]},  [r8], ip
630
        vst1.32         {d3[1]},  [r8], ip
631
        vst1.32         {d6[0]},  [r8], ip
632
        vst1.32         {d6[1]},  [r8], ip
633
        vst1.32         {d7[0]},  [r8], ip
634
        vst1.32         {d7[1]},  [r8], ip
635
        b               8f
636
7:      vsri.32         d18, d16, #16
637
        vsri.32         d19, d17, #16
638
        vst1.32         {d18[0]}, [r8], ip
639
        vsri.32         d22, d20, #16
640
        vst1.32         {d18[1]}, [r8], ip
641
        vsri.32         d23, d21, #16
642
        vst1.32         {d19[0]}, [r8], ip
643
        vst1.32         {d19[1]}, [r8], ip
644
        vst1.32         {d22[0]}, [r8], ip
645
        vst1.32         {d22[1]}, [r8], ip
646
        vst1.32         {d23[0]}, [r8], ip
647
        vst1.32         {d23[1]}, [r8], ip
648
8:      subs            r3,  r3,  #2
649
        add             r0,  r0,  #4
650
        popeq           {r4-r8,pc}
651

    
652
        @ 1 channel
4:      ldr             r4,  [r1],#4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},  [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},  [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},  [r5,:16], ip
        vst1.16         {d4[3]},  [r5,:16], ip
        vst1.16         {d5[1]},  [r5,:16], ip
        vst1.16         {d5[3]},  [r5,:16], ip
        vst1.16         {d6[1]},  [r5,:16], ip
        vst1.16         {d6[3]},  [r5,:16], ip
        vst1.16         {d7[1]},  [r5,:16], ip
        vst1.16         {d7[3]},  [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},  [r5,:16], ip
        vst1.16         {d0[3]},  [r5,:16], ip
        vst1.16         {d1[1]},  [r5,:16], ip
        vst1.16         {d1[3]},  [r5,:16], ip
        vst1.16         {d2[1]},  [r5,:16], ip
        vst1.16         {d2[3]},  [r5,:16], ip
        vst1.16         {d3[1]},  [r5,:16], ip
        vst1.16         {d3[3]},  [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},  [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},  [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
        .endfunc
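
@ ff_vector_fmul_neon: elementwise in-place product, dst[i] *= src[i],
@ with r0 = dst, r1 = src, r2 = length (a multiple of 8).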
function ff_vector_fmul_neon, export=1
        mov             r3,  r0
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},  [r0,:128]!
        vld1.64         {d4-d7},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19},[r3,:128]!
        vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23},[r3,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},  [r0,:128]!
        vld1.64         {d4-d5},  [r1,:128]!
        vst1.64         {d16-d17},[r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},  [r0,:128]!
        vld1.64         {d6-d7},  [r1,:128]!
        vst1.64         {d18-d19},[r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19},[r3,:128]!
        bx              lr
        .endfunc
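
@ ff_vector_fmul_window_neon: windowed overlap of two input vectors as
@ used for MDCT overlap-add: r0 = dst, r1/r2 = the two inputs, r3 =
@ window.  The bias operand arrives in s0 under a hard-float (VFP) ABI
@ or on the stack otherwise, with the length after it on the stack; the
@ loop walks the buffers from both ends toward the middle.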
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32         q8,  d0[0]
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
VFP     ldr             lr,  [sp, #12]
NOVFP   ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5, lsl #2
        add             r4,  r3,  r5, lsl #3
        add             ip,  r0,  r5, lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},  [r1,:128]!
        vld1.64         {d2,d3},  [r2,:128], r5
        vld1.64         {d4,d5},  [r3,:128]!
        vld1.64         {d6,d7},  [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmov            q11, q8
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},  [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19},[r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25},[r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},  [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21},[r0,:128]!
        vst1.64         {d22,d23},[ip,:128], r5
        pop             {r4,r5,pc}
        .endfunc