@ libavcodec/arm/h264dsp_neon.S (FFmpeg, revision a2fc0f6a)
/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        .fpu neon

        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
        .endm
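/* The transpose_8x8 macro above transposes an 8x8 block of bytes entirely
 * in registers, using vtrn at 32-, 16- and 8-bit granularity; when given q
 * registers it applies the same transpose to the low and high halves
 * independently. */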

        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
        .endm

        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
        .endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
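/* This is the 2x2 bilinear chroma interpolation from the H.264 spec:
 *   dst[i] = ((8-x)*(8-y)*A + x*(8-y)*B + (8-x)*y*C + x*y*D + 32) >> 6
 * where A..D are the four neighbouring source pixels and x/y are the
 * fractional offsets (0..7).  The scalar setup below derives the four
 * weights in r4/ip/r6/r7 and broadcasts them to d0-d3; when x or y is
 * zero the code drops to the cheaper two-tap or copy loops after label 2:. */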
        .macro  h264_chroma_mc8 avg=0
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.if \avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.if \avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
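/* Same interpolation for 4-pixel-wide blocks.  Each row is packed so that a
 * d register holds the four output positions next to their right neighbours
 * (vext #1 + vtrn.32), letting one vmull/vmlal cover two filter taps; the
 * halves are folded with vadd.i16 before the final rounding shift. */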
        .macro  h264_chroma_mc4 avg=0
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.if \avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4},     [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6},     [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6},     [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.if \avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endm

        .text
        .align

function ff_put_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8
        .endfunc

function ff_avg_h264_chroma_mc8_neon, export=1
        h264_chroma_mc8 avg=1
        .endfunc

function ff_put_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4
        .endfunc

function ff_avg_h264_chroma_mc4_neon, export=1
        h264_chroma_mc4 avg=1
        .endfunc

        /* H.264 loop filter */
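/* These are the standard H.264 deblocking filters for bS < 4.  In essence:
 *   delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
 *   p0 += delta,  q0 -= delta
 * with tc derived from tc0 and the |p2-p0| < beta / |q2-q0| < beta tests,
 * and p1/q1 additionally adjusted (luma only) when those tests pass.
 * h264_loop_filter_start loads alpha/beta/tc0 from the arguments and
 * returns early when alpha or beta is zero or all four tc0 values are
 * negative, i.e. when nothing would be filtered. */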

        .macro h264_loop_filter_start
        ldr             ip,  [sp]
        tst             r2,  r2
        ldr             ip,  [ip]
        tstne           r3,  r3
        vmov.32         d24[0], ip
        and             ip,  ip,  ip, lsl #16
        bxeq            lr
        ands            ip,  ip,  ip, lsl #8
        bxlt            lr
        .endm

        .macro align_push_regs
        and             ip,  sp,  #15
        add             ip,  ip,  #32
        sub             sp,  sp,  ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm

        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9
        vbsl            q5,  q14, q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
        .endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1
        vld1.64         {d2, d3},  [r0,:128], r1
        vld1.64         {d4, d5},  [r0,:128], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1},  [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]

        align_pop_regs
        bx              lr
        .endfunc

function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs
        sub             sp,  sp,  #16
        vst1.64         {d4, d5},  [sp,:128]
        sub             sp,  sp,  #16
        vst1.64         {d20,d21}, [sp,:128]

        h264_loop_filter_luma

        vld1.64         {d20,d21}, [sp,:128]!
        vld1.64         {d4, d5},  [sp,:128]!

        transpose_8x8   q3, q10, q4, q8, q0, q5, q2, q13

        sub             r0,  r0,  r1, lsl #4
        vst1.64         {d6},  [r0], r1
        vst1.64         {d20}, [r0], r1
        vst1.64         {d8},  [r0], r1
        vst1.64         {d16}, [r0], r1
        vst1.64         {d0},  [r0], r1
        vst1.64         {d10}, [r0], r1
        vst1.64         {d4},  [r0], r1
        vst1.64         {d26}, [r0], r1
        vst1.64         {d7},  [r0], r1
        vst1.64         {d21}, [r0], r1
        vst1.64         {d9},  [r0], r1
        vst1.64         {d17}, [r0], r1
        vst1.64         {d1},  [r0], r1
        vst1.64         {d11}, [r0], r1
        vst1.64         {d5},  [r0], r1
        vst1.64         {d27}, [r0], r1

        align_pop_regs
        bx              lr
        .endfunc

        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0
        vrshrn.i16      d4,  q2,  #3
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24
        vand            d26, d26, d30
        vmin.s8         d4,  d4,  d24
        vmovl.u8        q14, d16
        vand            d4,  d4,  d26
        vmax.s8         d4,  d4,  d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
        .endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
        .endfunc

        /* H.264 qpel MC */
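/* Luma quarter-pel interpolation.  Half-pel samples use the 6-tap filter
 * (1, -5, 20, 20, -5, 1) with rounding:
 *   b = (E - 5*F + 20*G + 20*H - 5*I + J + 16) >> 5
 * The 2D (hv) case keeps 16-bit intermediates from the horizontal pass and
 * rounds the vertical pass with (... + 512) >> 10.  Quarter-pel positions
 * average a half-pel result with a neighbouring sample (the _l2 helpers). */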

        .macro  lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
        .endm
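/* lowpass_const above packs the filter constants into d6 as 16-bit lanes:
 * d6[0] = 5 and d6[1] = 20, used by the vmls/vmla scalar multiplies in the
 * lowpass macros below. */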

        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq  t0
        .unreq  t1
        .endm
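/* lowpass_8 above applies the 6-tap filter horizontally to two 8-pixel rows
 * held in \r0:\r1 and \r2:\r3.  With narrow=1 the sums are rounded, clipped
 * and narrowed to bytes in \d0/\d1; with narrow=0 the raw 16-bit sums are
 * left in \d0/\d1 for a second (vertical) pass. */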

        .macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq  t0
        .endm

        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
        .endm

    
705
function put_h264_qpel16_h_lowpass_neon_packed
706
        mov             r4,  lr
707
        mov             ip,  #16
708
        mov             r3,  #8
709
        bl              put_h264_qpel8_h_lowpass_neon
710
        sub             r1,  r1,  r2, lsl #4
711
        add             r1,  r1,  #8
712
        mov             ip,  #16
713
        mov             lr,  r4
714
        b               put_h264_qpel8_h_lowpass_neon
715
        .endfunc
716

    
717
function put_h264_qpel16_h_lowpass_neon
718
        push            {lr}
719
        mov             ip,  #16
720
        bl              put_h264_qpel8_h_lowpass_neon
721
        sub             r0,  r0,  r3, lsl #4
722
        sub             r1,  r1,  r2, lsl #4
723
        add             r0,  r0,  #8
724
        add             r1,  r1,  #8
725
        mov             ip,  #16
726
        pop             {lr}
727
        .endfunc
728

    
729
function put_h264_qpel8_h_lowpass_neon
730
1:      vld1.64         {d0, d1},  [r1], r2
731
        vld1.64         {d16,d17}, [r1], r2
732
        subs            ip,  ip,  #2
733
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
734
        vst1.64         {d0},     [r0,:64], r3
735
        vst1.64         {d16},    [r0,:64], r3
736
        bne             1b
737
        bx              lr
738
        .endfunc
739

    
740
function put_h264_qpel16_h_lowpass_l2_neon
741
        push            {lr}
742
        mov             ip,  #16
743
        bl              put_h264_qpel8_h_lowpass_l2_neon
744
        sub             r0,  r0,  r2, lsl #4
745
        sub             r1,  r1,  r2, lsl #4
746
        sub             r3,  r3,  r2, lsl #4
747
        add             r0,  r0,  #8
748
        add             r1,  r1,  #8
749
        add             r3,  r3,  #8
750
        mov             ip,  #16
751
        pop             {lr}
752
        .endfunc
753

    
754
function put_h264_qpel8_h_lowpass_l2_neon
755
1:      vld1.64         {d0, d1},  [r1], r2
756
        vld1.64         {d16,d17}, [r1], r2
757
        vld1.64         {d28},     [r3], r2
758
        vld1.64         {d29},     [r3], r2
759
        subs            ip,  ip,  #2
760
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
761
        vrhadd.u8       q0,  q0,  q14
762
        vst1.64         {d0},      [r0,:64], r2
763
        vst1.64         {d1},      [r0,:64], r2
764
        bne             1b
765
        bx              lr
766
        .endfunc
767

    
768
function put_h264_qpel16_v_lowpass_neon_packed
769
        mov             r4,  lr
770
        mov             r2,  #8
771
        bl              put_h264_qpel8_v_lowpass_neon
772
        sub             r1,  r1,  r3, lsl #2
773
        bl              put_h264_qpel8_v_lowpass_neon
774
        sub             r1,  r1,  r3, lsl #4
775
        sub             r1,  r1,  r3, lsl #2
776
        add             r1,  r1,  #8
777
        bl              put_h264_qpel8_v_lowpass_neon
778
        sub             r1,  r1,  r3, lsl #2
779
        mov             lr,  r4
780
        b               put_h264_qpel8_v_lowpass_neon
781
        .endfunc
782

    
783
function put_h264_qpel16_v_lowpass_neon
784
        mov             r4,  lr
785
        bl              put_h264_qpel8_v_lowpass_neon
786
        sub             r1,  r1,  r3, lsl #2
787
        bl              put_h264_qpel8_v_lowpass_neon
788
        sub             r0,  r0,  r2, lsl #4
789
        add             r0,  r0,  #8
790
        sub             r1,  r1,  r3, lsl #4
791
        sub             r1,  r1,  r3, lsl #2
792
        add             r1,  r1,  #8
793
        bl              put_h264_qpel8_v_lowpass_neon
794
        sub             r1,  r1,  r3, lsl #2
795
        mov             lr,  r4
796
        .endfunc
797

    
798
function put_h264_qpel8_v_lowpass_neon
799
        vld1.64         {d8},  [r1], r3
800
        vld1.64         {d10}, [r1], r3
801
        vld1.64         {d12}, [r1], r3
802
        vld1.64         {d14}, [r1], r3
803
        vld1.64         {d22}, [r1], r3
804
        vld1.64         {d24}, [r1], r3
805
        vld1.64         {d26}, [r1], r3
806
        vld1.64         {d28}, [r1], r3
807
        vld1.64         {d9},  [r1], r3
808
        vld1.64         {d11}, [r1], r3
809
        vld1.64         {d13}, [r1], r3
810
        vld1.64         {d15}, [r1], r3
811
        vld1.64         {d23}, [r1]
812

    
813
        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
814
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
815
        lowpass_8       d12, d13, d14, d15, d12, d14
816
        lowpass_8       d22, d23, d24, d25, d22, d24
817
        lowpass_8       d26, d27, d28, d29, d26, d28
818
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
819

    
820
        vst1.64         {d8},  [r0,:64], r2
821
        vst1.64         {d10}, [r0,:64], r2
822
        vst1.64         {d12}, [r0,:64], r2
823
        vst1.64         {d14}, [r0,:64], r2
824
        vst1.64         {d22}, [r0,:64], r2
825
        vst1.64         {d24}, [r0,:64], r2
826
        vst1.64         {d26}, [r0,:64], r2
827
        vst1.64         {d28}, [r0,:64], r2
828

    
829
        bx              lr
830
        .endfunc
831

    
832
function put_h264_qpel16_v_lowpass_l2_neon
833
        mov             r4,  lr
834
        bl              put_h264_qpel8_v_lowpass_l2_neon
835
        sub             r1,  r1,  r3, lsl #2
836
        bl              put_h264_qpel8_v_lowpass_l2_neon
837
        sub             r0,  r0,  r3, lsl #4
838
        sub             ip,  ip,  r2, lsl #4
839
        add             r0,  r0,  #8
840
        add             ip,  ip,  #8
841
        sub             r1,  r1,  r3, lsl #4
842
        sub             r1,  r1,  r3, lsl #2
843
        add             r1,  r1,  #8
844
        bl              put_h264_qpel8_v_lowpass_l2_neon
845
        sub             r1,  r1,  r3, lsl #2
846
        mov             lr,  r4
847
        .endfunc
848

    
849
function put_h264_qpel8_v_lowpass_l2_neon
850
        vld1.64         {d8},  [r1], r3
851
        vld1.64         {d10}, [r1], r3
852
        vld1.64         {d12}, [r1], r3
853
        vld1.64         {d14}, [r1], r3
854
        vld1.64         {d22}, [r1], r3
855
        vld1.64         {d24}, [r1], r3
856
        vld1.64         {d26}, [r1], r3
857
        vld1.64         {d28}, [r1], r3
858
        vld1.64         {d9},  [r1], r3
859
        vld1.64         {d11}, [r1], r3
860
        vld1.64         {d13}, [r1], r3
861
        vld1.64         {d15}, [r1], r3
862
        vld1.64         {d23}, [r1]
863

    
864
        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
865
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
866
        lowpass_8       d12, d13, d14, d15, d12, d13
867
        lowpass_8       d22, d23, d24, d25, d22, d23
868
        lowpass_8       d26, d27, d28, d29, d26, d27
869
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27
870

    
871
        vld1.64         {d0},  [ip], r2
872
        vld1.64         {d1},  [ip], r2
873
        vld1.64         {d2},  [ip], r2
874
        vld1.64         {d3},  [ip], r2
875
        vld1.64         {d4},  [ip], r2
876
        vrhadd.u8       q0,  q0,  q4
877
        vld1.64         {d5},  [ip], r2
878
        vrhadd.u8       q1,  q1,  q6
879
        vld1.64         {d10}, [ip], r2
880
        vrhadd.u8       q2,  q2,  q11
881
        vld1.64         {d11}, [ip], r2
882

    
883
        vst1.64         {d0},  [r0,:64], r3
884
        vst1.64         {d1},  [r0,:64], r3
885
        vrhadd.u8       q5,  q5,  q13
886
        vst1.64         {d2},  [r0,:64], r3
887
        vst1.64         {d3},  [r0,:64], r3
888
        vst1.64         {d4},  [r0,:64], r3
889
        vst1.64         {d5},  [r0,:64], r3
890
        vst1.64         {d10}, [r0,:64], r3
891
        vst1.64         {d11}, [r0,:64], r3
892

    
893
        bx              lr
894
        .endfunc
895

    
896
function put_h264_qpel8_hv_lowpass_neon_top
897
        lowpass_const   ip
898
        mov             ip,  #12
899
1:      vld1.64         {d0, d1},  [r1], r3
900
        vld1.64         {d16,d17}, [r1], r3
901
        subs            ip,  ip,  #2
902
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
903
        vst1.64         {d22-d25}, [r4,:128]!
904
        bne             1b
905

    
906
        vld1.64         {d0, d1},  [r1]
907
        lowpass_8_1     d0,  d1,  q12, narrow=0
908

    
909
        mov             ip,  #-16
910
        add             r4,  r4,  ip
911
        vld1.64         {d30,d31}, [r4,:128], ip
912
        vld1.64         {d20,d21}, [r4,:128], ip
913
        vld1.64         {d18,d19}, [r4,:128], ip
914
        vld1.64         {d16,d17}, [r4,:128], ip
915
        vld1.64         {d14,d15}, [r4,:128], ip
916
        vld1.64         {d12,d13}, [r4,:128], ip
917
        vld1.64         {d10,d11}, [r4,:128], ip
918
        vld1.64         {d8, d9},  [r4,:128], ip
919
        vld1.64         {d6, d7},  [r4,:128], ip
920
        vld1.64         {d4, d5},  [r4,:128], ip
921
        vld1.64         {d2, d3},  [r4,:128], ip
922
        vld1.64         {d0, d1},  [r4,:128]
923

    
924
        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
925
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7
926

    
927
        swap4           d17, d19, d21, d31, d24, d26, d28, d22
928
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11
929

    
930
        vst1.64         {d30,d31}, [r4,:128]!
931
        vst1.64         {d6, d7},  [r4,:128]!
932
        vst1.64         {d20,d21}, [r4,:128]!
933
        vst1.64         {d4, d5},  [r4,:128]!
934
        vst1.64         {d18,d19}, [r4,:128]!
935
        vst1.64         {d2, d3},  [r4,:128]!
936
        vst1.64         {d16,d17}, [r4,:128]!
937
        vst1.64         {d0, d1},  [r4,:128]
938

    
939
        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
940
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
941
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
942
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11
943

    
944
        vld1.64         {d16,d17}, [r4,:128], ip
945
        vld1.64         {d30,d31}, [r4,:128], ip
946
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
947
        vld1.64         {d16,d17}, [r4,:128], ip
948
        vld1.64         {d30,d31}, [r4,:128], ip
949
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
950
        vld1.64         {d16,d17}, [r4,:128], ip
951
        vld1.64         {d30,d31}, [r4,:128], ip
952
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
953
        vld1.64         {d16,d17}, [r4,:128], ip
954
        vld1.64         {d30,d31}, [r4,:128]
955
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15
956

    
957
        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11
958

    
959
        bx              lr
960
        .endfunc
961

    
962
function put_h264_qpel8_hv_lowpass_neon
963
        mov             r10, lr
964
        bl              put_h264_qpel8_hv_lowpass_neon_top
965
        vst1.64         {d12},     [r0,:64], r2
966
        vst1.64         {d13},     [r0,:64], r2
967
        vst1.64         {d14},     [r0,:64], r2
968
        vst1.64         {d15},     [r0,:64], r2
969
        vst1.64         {d8},      [r0,:64], r2
970
        vst1.64         {d9},      [r0,:64], r2
971
        vst1.64         {d10},     [r0,:64], r2
972
        vst1.64         {d11},     [r0,:64], r2
973

    
974
        mov             lr,  r10
975
        bx              lr
976
        .endfunc
977

    
978
function put_h264_qpel8_hv_lowpass_l2_neon
979
        mov             r10, lr
980
        bl              put_h264_qpel8_hv_lowpass_neon_top
981

    
982
        vld1.64         {d0, d1},  [r2,:128]!
983
        vld1.64         {d2, d3},  [r2,:128]!
984
        vrhadd.u8       q0,  q0,  q6
985
        vld1.64         {d4, d5},  [r2,:128]!
986
        vrhadd.u8       q1,  q1,  q7
987
        vld1.64         {d6, d7},  [r2,:128]!
988
        vrhadd.u8       q2,  q2,  q4
989

    
990
        vst1.64         {d0},      [r0,:64], r3
991
        vrhadd.u8       q3,  q3,  q5
992
        vst1.64         {d1},      [r0,:64], r3
993
        vst1.64         {d2},      [r0,:64], r3
994
        vst1.64         {d3},      [r0,:64], r3
995
        vst1.64         {d4},      [r0,:64], r3
996
        vst1.64         {d5},      [r0,:64], r3
997
        vst1.64         {d6},      [r0,:64], r3
998
        vst1.64         {d7},      [r0,:64], r3
999

    
1000
        mov             lr,  r10
1001
        bx              lr
1002
        .endfunc
1003

    
1004
function put_h264_qpel16_hv_lowpass_neon
1005
        mov             r9,  lr
1006
        bl              put_h264_qpel8_hv_lowpass_neon
1007
        sub             r1,  r1,  r3, lsl #2
1008
        bl              put_h264_qpel8_hv_lowpass_neon
1009
        sub             r1,  r1,  r3, lsl #4
1010
        sub             r1,  r1,  r3, lsl #2
1011
        add             r1,  r1,  #8
1012
        sub             r0,  r0,  r2, lsl #4
1013
        add             r0,  r0,  #8
1014
        bl              put_h264_qpel8_hv_lowpass_neon
1015
        sub             r1,  r1,  r3, lsl #2
1016
        mov             lr,  r9
1017
        b               put_h264_qpel8_hv_lowpass_neon
1018
        .endfunc
1019

    
1020
function put_h264_qpel16_hv_lowpass_l2_neon
1021
        mov             r9,  lr
1022
        sub             r2,  r4,  #256
1023
        bl              put_h264_qpel8_hv_lowpass_l2_neon
1024
        sub             r1,  r1,  r3, lsl #2
1025
        bl              put_h264_qpel8_hv_lowpass_l2_neon
1026
        sub             r1,  r1,  r3, lsl #4
1027
        sub             r1,  r1,  r3, lsl #2
1028
        add             r1,  r1,  #8
1029
        sub             r0,  r0,  r3, lsl #4
1030
        add             r0,  r0,  #8
1031
        bl              put_h264_qpel8_hv_lowpass_l2_neon
1032
        sub             r1,  r1,  r3, lsl #2
1033
        mov             lr,  r9
1034
        b               put_h264_qpel8_hv_lowpass_l2_neon
1035
        .endfunc
1036

    
1037
function ff_put_h264_qpel8_mc10_neon, export=1
1038
        lowpass_const   r3
1039
        mov             r3,  r1
1040
        sub             r1,  r1,  #2
1041
        mov             ip,  #8
1042
        b               put_h264_qpel8_h_lowpass_l2_neon
1043
        .endfunc
1044

    
1045
function ff_put_h264_qpel8_mc20_neon, export=1
1046
        lowpass_const   r3
1047
        sub             r1,  r1,  #2
1048
        mov             r3,  r2
1049
        mov             ip,  #8
1050
        b               put_h264_qpel8_h_lowpass_neon
1051
        .endfunc
1052

    
1053
function ff_put_h264_qpel8_mc30_neon, export=1
1054
        lowpass_const   r3
1055
        add             r3,  r1,  #1
1056
        sub             r1,  r1,  #2
1057
        mov             ip,  #8
1058
        b               put_h264_qpel8_h_lowpass_l2_neon
1059
        .endfunc
1060

    
1061
function ff_put_h264_qpel8_mc01_neon, export=1
1062
        push            {lr}
1063
        mov             ip,  r1
1064
put_h264_qpel8_mc01:
1065
        lowpass_const   r3
1066
        mov             r3,  r2
1067
        sub             r1,  r1,  r2, lsl #1
1068
        vpush           {d8-d15}
1069
        bl              put_h264_qpel8_v_lowpass_l2_neon
1070
        vpop            {d8-d15}
1071
        pop             {pc}
1072
        .endfunc
1073

    
1074
function ff_put_h264_qpel8_mc11_neon, export=1
1075
        push            {r0, r1, r2, lr}
1076
put_h264_qpel8_mc11:
1077
        lowpass_const   r3
1078
        sub             sp,  sp,  #64
1079
        mov             r0,  sp
1080
        sub             r1,  r1,  #2
1081
        mov             r3,  #8
1082
        mov             ip,  #8
1083
        vpush           {d8-d15}
1084
        bl              put_h264_qpel8_h_lowpass_neon
1085
        ldrd            r0,  [sp, #128]
1086
        mov             r3,  r2
1087
        add             ip,  sp,  #64
1088
        sub             r1,  r1,  r2, lsl #1
1089
        mov             r2,  #8
1090
        bl              put_h264_qpel8_v_lowpass_l2_neon
1091
        vpop            {d8-d15}
1092
        add             sp,  sp,  #76
1093
        pop             {pc}
1094
        .endfunc
1095

    
1096
function ff_put_h264_qpel8_mc21_neon, export=1
1097
        push            {r0, r1, r4, r10, r11, lr}
1098
put_h264_qpel8_mc21:
1099
        lowpass_const   r3
1100
        mov             r11, sp
1101
        bic             sp,  sp,  #15
1102
        sub             sp,  sp,  #(8*8+16*12)
1103
        sub             r1,  r1,  #2
1104
        mov             r3,  #8
1105
        mov             r0,  sp
1106
        mov             ip,  #8
1107
        vpush           {d8-d15}
1108
        bl              put_h264_qpel8_h_lowpass_neon
1109
        mov             r4,  r0
1110
        ldrd            r0,  [r11]
1111
        sub             r1,  r1,  r2, lsl #1
1112
        sub             r1,  r1,  #2
1113
        mov             r3,  r2
1114
        sub             r2,  r4,  #64
1115
        bl              put_h264_qpel8_hv_lowpass_l2_neon
1116
        vpop            {d8-d15}
1117
        add             sp,  r11,  #8
1118
        pop             {r4, r10, r11, pc}
1119
        .endfunc
1120

    
1121
function ff_put_h264_qpel8_mc31_neon, export=1
1122
        add             r1,  r1,  #1
1123
        push            {r0, r1, r2, lr}
1124
        sub             r1,  r1,  #1
1125
        b               put_h264_qpel8_mc11
1126
        .endfunc
1127

    
1128
function ff_put_h264_qpel8_mc02_neon, export=1
1129
        push            {lr}
1130
        lowpass_const   r3
1131
        sub             r1,  r1,  r2, lsl #1
1132
        mov             r3,  r2
1133
        vpush           {d8-d15}
1134
        bl              put_h264_qpel8_v_lowpass_neon
1135
        vpop            {d8-d15}
1136
        pop             {pc}
1137
        .endfunc
1138

    
1139
function ff_put_h264_qpel8_mc12_neon, export=1
1140
        push            {r0, r1, r4, r10, r11, lr}
1141
put_h264_qpel8_mc12:
1142
        lowpass_const   r3
1143
        mov             r11, sp
1144
        bic             sp,  sp,  #15
1145
        sub             sp,  sp,  #(8*8+16*12)
1146
        sub             r1,  r1,  r2, lsl #1
1147
        mov             r3,  r2
1148
        mov             r2,  #8
1149
        mov             r0,  sp
1150
        vpush           {d8-d15}
1151
        bl              put_h264_qpel8_v_lowpass_neon
1152
        mov             r4,  r0
1153
        ldrd            r0,  [r11]
1154
        sub             r1,  r1,  r3, lsl #1
1155
        sub             r1,  r1,  #2
1156
        sub             r2,  r4,  #64
1157
        bl              put_h264_qpel8_hv_lowpass_l2_neon
1158
        vpop            {d8-d15}
1159
        add             sp,  r11,  #8
1160
        pop             {r4, r10, r11, pc}
1161
        .endfunc
1162

    
1163
function ff_put_h264_qpel8_mc22_neon, export=1
1164
        push            {r4, r10, r11, lr}
1165
        mov             r11, sp
1166
        bic             sp,  sp,  #15
1167
        sub             r1,  r1,  r2, lsl #1
1168
        sub             r1,  r1,  #2
1169
        mov             r3,  r2
1170
        sub             sp,  sp,  #(16*12)
1171
        mov             r4,  sp
1172
        vpush           {d8-d15}
1173
        bl              put_h264_qpel8_hv_lowpass_neon
1174
        vpop            {d8-d15}
1175
        mov             sp,  r11
1176
        pop             {r4, r10, r11, pc}
1177
        .endfunc
1178

    
1179
function ff_put_h264_qpel8_mc32_neon, export=1
1180
        push            {r0, r1, r4, r10, r11, lr}
1181
        add             r1,  r1,  #1
1182
        b               put_h264_qpel8_mc12
1183
        .endfunc
1184

    
1185
function ff_put_h264_qpel8_mc03_neon, export=1
1186
        push            {lr}
1187
        add             ip,  r1,  r2
1188
        b               put_h264_qpel8_mc01
1189
        .endfunc
1190

    
1191
function ff_put_h264_qpel8_mc13_neon, export=1
1192
        push            {r0, r1, r2, lr}
1193
        add             r1,  r1,  r2
1194
        b               put_h264_qpel8_mc11
1195
        .endfunc
1196

    
1197
function ff_put_h264_qpel8_mc23_neon, export=1
1198
        push            {r0, r1, r4, r10, r11, lr}
1199
        add             r1,  r1,  r2
1200
        b               put_h264_qpel8_mc21
1201
        .endfunc
1202

    
1203
function ff_put_h264_qpel8_mc33_neon, export=1
1204
        add             r1,  r1,  #1
1205
        push            {r0, r1, r2, lr}
1206
        add             r1,  r1,  r2
1207
        sub             r1,  r1,  #1
1208
        b               put_h264_qpel8_mc11
1209
        .endfunc
1210

    
1211
function ff_put_h264_qpel16_mc10_neon, export=1
1212
        lowpass_const   r3
1213
        mov             r3,  r1
1214
        sub             r1,  r1,  #2
1215
        b               put_h264_qpel16_h_lowpass_l2_neon
1216
        .endfunc
1217

    
1218
function ff_put_h264_qpel16_mc20_neon, export=1
1219
        lowpass_const   r3
1220
        sub             r1,  r1,  #2
1221
        mov             r3,  r2
1222
        b               put_h264_qpel16_h_lowpass_neon
1223
        .endfunc
1224

    
1225
function ff_put_h264_qpel16_mc30_neon, export=1
1226
        lowpass_const   r3
1227
        add             r3,  r1,  #1
1228
        sub             r1,  r1,  #2
1229
        b               put_h264_qpel16_h_lowpass_l2_neon
1230
        .endfunc
1231

    
1232
function ff_put_h264_qpel16_mc01_neon, export=1
1233
        push            {r4, lr}
1234
        mov             ip,  r1
1235
put_h264_qpel16_mc01:
1236
        lowpass_const   r3
1237
        mov             r3,  r2
1238
        sub             r1,  r1,  r2, lsl #1
1239
        vpush           {d8-d15}
1240
        bl              put_h264_qpel16_v_lowpass_l2_neon
1241
        vpop            {d8-d15}
1242
        pop             {r4, pc}
1243
        .endfunc
1244

    
1245
function ff_put_h264_qpel16_mc11_neon, export=1
1246
        push            {r0, r1, r4, lr}
1247
put_h264_qpel16_mc11:
1248
        lowpass_const   r3
1249
        sub             sp,  sp,  #256
1250
        mov             r0,  sp
1251
        sub             r1,  r1,  #2
1252
        mov             r3,  #16
1253
        vpush           {d8-d15}
1254
        bl              put_h264_qpel16_h_lowpass_neon
1255
        add             r0,  sp,  #256
1256
        ldrd            r0,  [r0, #64]
1257
        mov             r3,  r2
1258
        add             ip,  sp,  #64
1259
        sub             r1,  r1,  r2, lsl #1
1260
        mov             r2,  #16
1261
        bl              put_h264_qpel16_v_lowpass_l2_neon
1262
        vpop            {d8-d15}
1263
        add             sp,  sp,  #(256+8)
1264
        pop             {r4, pc}
1265
        .endfunc
1266

    
1267
function ff_put_h264_qpel16_mc21_neon, export=1
1268
        push            {r0, r1, r4-r5, r9-r11, lr}
1269
put_h264_qpel16_mc21:
1270
        lowpass_const   r3
1271
        mov             r11, sp
1272
        bic             sp,  sp,  #15
1273
        sub             sp,  sp,  #(16*16+16*12)
1274
        sub             r1,  r1,  #2
1275
        mov             r0,  sp
1276
        vpush           {d8-d15}
1277
        bl              put_h264_qpel16_h_lowpass_neon_packed
1278
        mov             r4,  r0
1279
        ldrd            r0,  [r11]
1280
        sub             r1,  r1,  r2, lsl #1
1281
        sub             r1,  r1,  #2
1282
        mov             r3,  r2
1283
        bl              put_h264_qpel16_hv_lowpass_l2_neon
1284
        vpop            {d8-d15}
1285
        add             sp,  r11,  #8
1286
        pop             {r4-r5, r9-r11, pc}
1287
        .endfunc
1288

    
1289
function ff_put_h264_qpel16_mc31_neon, export=1
1290
        add             r1,  r1,  #1
1291
        push            {r0, r1, r4, lr}
1292
        sub             r1,  r1,  #1
1293
        b               put_h264_qpel16_mc11
1294
        .endfunc
1295

    
1296
function ff_put_h264_qpel16_mc02_neon, export=1
1297
        push            {r4, lr}
1298
        lowpass_const   r3
1299
        sub             r1,  r1,  r2, lsl #1
1300
        mov             r3,  r2
1301
        vpush           {d8-d15}
1302
        bl              put_h264_qpel16_v_lowpass_neon
1303
        vpop            {d8-d15}
1304
        pop             {r4, pc}
1305
        .endfunc
1306

    
1307
function ff_put_h264_qpel16_mc12_neon, export=1
1308
        push            {r0, r1, r4-r5, r9-r11, lr}
1309
put_h264_qpel16_mc12:
1310
        lowpass_const   r3
1311
        mov             r11, sp
1312
        bic             sp,  sp,  #15
1313
        sub             sp,  sp,  #(16*16+16*12)
1314
        sub             r1,  r1,  r2, lsl #1
1315
        mov             r0,  sp
1316
        mov             r3,  r2
1317
        vpush           {d8-d15}
1318
        bl              put_h264_qpel16_v_lowpass_neon_packed
1319
        mov             r4,  r0
1320
        ldrd            r0,  [r11]
1321
        sub             r1,  r1,  r3, lsl #1
1322
        sub             r1,  r1,  #2
1323
        mov             r2,  r3
1324
        bl              put_h264_qpel16_hv_lowpass_l2_neon
1325
        vpop            {d8-d15}
1326
        add             sp,  r11,  #8
1327
        pop             {r4-r5, r9-r11, pc}
1328
        .endfunc
1329

    
1330
function ff_put_h264_qpel16_mc22_neon, export=1
1331
        push            {r4, r9-r11, lr}
1332
        lowpass_const   r3
1333
        mov             r11, sp
1334
        bic             sp,  sp,  #15
1335
        sub             r1,  r1,  r2, lsl #1
1336
        sub             r1,  r1,  #2
1337
        mov             r3,  r2
1338
        sub             sp,  sp,  #(16*12)
1339
        mov             r4,  sp
1340
        vpush           {d8-d15}
1341
        bl              put_h264_qpel16_hv_lowpass_neon
1342
        vpop            {d8-d15}
1343
        mov             sp,  r11
1344
        pop             {r4, r9-r11, pc}
1345
        .endfunc
1346

    
1347
function ff_put_h264_qpel16_mc32_neon, export=1
1348
        push            {r0, r1, r4-r5, r9-r11, lr}
1349
        add             r1,  r1,  #1
1350
        b               put_h264_qpel16_mc12
1351
        .endfunc
1352

    
1353
function ff_put_h264_qpel16_mc03_neon, export=1
1354
        push            {r4, lr}
1355
        add             ip,  r1,  r2
1356
        b               put_h264_qpel16_mc01
1357
        .endfunc
1358

    
1359
function ff_put_h264_qpel16_mc13_neon, export=1
1360
        push            {r0, r1, r4, lr}
1361
        add             r1,  r1,  r2
1362
        b               put_h264_qpel16_mc11
1363
        .endfunc
1364

    
1365
function ff_put_h264_qpel16_mc23_neon, export=1
1366
        push            {r0, r1, r4-r5, r9-r11, lr}
1367
        add             r1,  r1,  r2
1368
        b               put_h264_qpel16_mc21
1369
        .endfunc
1370

    
1371
function ff_put_h264_qpel16_mc33_neon, export=1
1372
        add             r1,  r1,  #1
1373
        push            {r0, r1, r4, lr}
1374
        add             r1,  r1,  r2
1375
        sub             r1,  r1,  #1
1376
        b               put_h264_qpel16_mc11
1377
        .endfunc