Statistics
| Branch: | Revision:

ffmpeg / libavcodec / arm / h264dsp_neon.S @ 77c45373

History | View | Annotate | Download (45.5 KB)

1
/*
2
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "asm.S"
22

    
23
        .fpu neon
24

    
25
        @ In-place transpose of an 8x8 byte matrix spread across eight
        @ registers (one row each), built from three rounds of pairwise
        @ vtrn exchanges: 32-bit, then 16-bit, then 8-bit lanes.  The
        @ exchanges inside each round touch disjoint register pairs, so
        @ their relative order is free.
        .macro transpose_8x8 a0 a1 a2 a3 a4 a5 a6 a7
        vtrn.32         \a3, \a7
        vtrn.32         \a2, \a6
        vtrn.32         \a1, \a5
        vtrn.32         \a0, \a4
        vtrn.16         \a5, \a7
        vtrn.16         \a4, \a6
        vtrn.16         \a1, \a3
        vtrn.16         \a0, \a2
        vtrn.8          \a6, \a7
        vtrn.8          \a4, \a5
        vtrn.8          \a2, \a3
        vtrn.8          \a0, \a1
        .endm
39

    
40
        @ Exchange the contents of four register pairs:
        @ (\a0,\a4), (\a1,\a5), (\a2,\a6), (\a3,\a7).
        @ All four vswp operations are independent.
        .macro swap4 a0 a1 a2 a3 a4 a5 a6 a7
        vswp            \a3, \a7
        vswp            \a2, \a6
        vswp            \a1, \a5
        vswp            \a0, \a4
        .endm
46

    
47
        @ Transpose two groups of four registers as 4x4 matrices of
        @ 16-bit elements, using two rounds of vtrn (32-bit lanes, then
        @ 16-bit lanes).  Exchanges within a round use disjoint pairs.
        .macro transpose16_4x4 a0 a1 a2 a3 a4 a5 a6 a7
        vtrn.32         \a5, \a7
        vtrn.32         \a4, \a6
        vtrn.32         \a1, \a3
        vtrn.32         \a0, \a2
        vtrn.16         \a6, \a7
        vtrn.16         \a4, \a5
        vtrn.16         \a2, \a3
        vtrn.16         \a0, \a1
        .endm
57

    
58
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
        @ 8-pixel-wide H.264 chroma motion compensation: bilinear blend of
        @ a 2x2 source neighbourhood with eighth-pel weights
        @   A=(8-x)(8-y)  B=x(8-y)  C=(8-x)y  D=xy,
        @ rounded by vrshrn #6, i.e. (sum + 32) >> 6.
        @ \type is "put" (store) or "avg" (average with existing dst,
        @ which is re-read through lr).  Two rows are produced per loop
        @ iteration; r3 (h) is assumed even.
        .macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]  @ r4 = x, r5 = y (ldrd fills r4,r5)
.ifc \type,avg
        mov             lr,  r0         @ lr = dst, for re-reading in avg
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5    @ r7 = D = x*y; Z set when x*y == 0
        rsb             r6,  r7,  r5,  lsl #3   @ r6 = C = (8-x)*y
        rsb             ip,  r7,  r4,  lsl #3   @ ip = B = x*(8-y)
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64   @ r4 = A = 64 - 8x - 8y + xy = (8-x)(8-y)

        beq             2f              @ x == 0 or y == 0: 1-D cases

        add             r5,  r1,  r2    @ r5 = second source row pointer

        vdup.8          d0,  r4         @ d0 = A
        lsl             r4,  r2,  #1    @ r4 = 2*stride (both row pointers step by 2)
        vdup.8          d1,  ip         @ d1 = B
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6         @ d2 = C
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7         @ d3 = D

        vext.8          d5,  d4,  d5,  #1       @ d5 = row shifted left by 1 pixel
        vext.8          d7,  d6,  d7,  #1

        @ Full 2-D bilinear loop: q8/q9 accumulate two output rows; the
        @ next rows' loads are interleaved with the multiply-accumulates.
1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10   @ average with existing dst
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

        @ 1-D cases: x*y == 0.  r6 = (8-x)*y is zero iff y == 0 here,
        @ so beq 4f selects horizontal-only; otherwise vertical-only.
        @ The two nonzero weights collapse into d0/d1 (they sum to 64).
2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f

        @ Vertical-only interpolation (x == 0, y != 0), two rows/iter.
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

        @ Horizontal-only interpolation (y == 0), two rows/iter.
4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm
183

    
184
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
        @ 4-pixel-wide variant of the chroma MC above.  Same weight setup
        @ (A,B,C,D as for mc8) but two 4-pixel rows are packed into the
        @ halves of one d register (via vtrn.32) so each vmull processes
        @ both rows at once; the two halves of the widened accumulator
        @ are then folded with vadd.i16 before the final vrshrn #6.
        .macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]  @ r4 = x, r5 = y
.ifc \type,avg
        mov             lr,  r0         @ lr = dst, for re-reading in avg
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5    @ r7 = D = x*y; Z set when x*y == 0
        rsb             r6,  r7,  r5,  lsl #3   @ r6 = C = (8-x)*y
        rsb             ip,  r7,  r4,  lsl #3   @ ip = B = x*(8-y)
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64   @ r4 = A = (8-x)(8-y)

        beq             2f              @ x == 0 or y == 0: 1-D cases

        add             r5,  r1,  r2

        vdup.8          d0,  r4         @ d0 = A
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip         @ d1 = B
        vld1.64         {d4},     [r1], r4
        vdup.8          d2,  r6         @ d2 = C
        vld1.64         {d6},     [r5], r4
        vdup.8          d3,  r7         @ d3 = D

        @ Pack pixel row and its 1-shifted copy into the two halves of
        @ one register; do the same for the weight pairs.
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1         @ d0 = A|B packed per half
        vtrn.32         d2,  d3         @ d2 = C|D packed per half

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6},     [r5], r4
        vadd.i16        d16, d16, d17   @ fold the two packed halves
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

        @ 1-D cases; r6 == 0 iff y == 0 (horizontal-only at 4f).
2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        @ Vertical-only: two rows packed per register, weight pairs in
        @ d0/d1 offset by one row via vext.
        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

        @ Horizontal-only interpolation, two rows per iteration.
4:      vld1.64         {d4},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm
317

    
318
        .text
        .align

        @ Instantiate the chroma MC macros: one "put" (plain store) and
        @ one "avg" (average with existing dst) function each for the
        @ 8-wide and 4-wide cases.
        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
325

    
326
        /* H.264 loop filter */
327

    
328
        @ Common entry for the loop-filter functions.
        @ In: r2 = alpha, r3 = beta, [sp] = pointer to the four tc0 values.
        @ Loads the packed tc0 word into ip and d24[0], then returns to
        @ the *caller's caller* (bx lr) early when nothing needs doing:
        @ alpha == 0, beta == 0, or all four tc0 bytes negative.
        .macro h264_loop_filter_start
        ldr             ip,  [sp]       @ ip = tc0 pointer
        tst             r2,  r2         @ Z if alpha == 0
        ldr             ip,  [ip]       @ ip = the 4 tc0 bytes, packed
        tstne           r3,  r3         @ (alpha != 0) ? Z if beta == 0
        vmov.32         d24[0], ip      @ tc0 bytes for the NEON filter core
        and             ip,  ip,  ip, lsl #16
        bxeq            lr              @ alpha == 0 || beta == 0: skip edge
        ands            ip,  ip,  ip, lsl #8    @ bit31 = AND of all 4 sign bits
        bxlt            lr              @ negative <=> all tc0 < 0: skip edge
        .endm
339

    
340
        @ Save the callee-saved NEON registers q4-q7 (d8-d15) on a
        @ 16-byte-aligned stack.  ip is left holding the total adjustment
        @ for the d12-d15 slot so align_pop_regs can undo it; ip must be
        @ preserved between the two macros.
        .macro align_push_regs
        and             ip,  sp,  #15   @ current misalignment of sp
        add             ip,  ip,  #32   @ + room for d12-d15
        sub             sp,  sp,  ip    @ sp now 16-byte aligned
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
        .endm
348

    
349
        @ Restore q4-q7 saved by align_push_regs; the post-increment by
        @ ip pops the alignment padding computed there.
        .macro align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm
353

    
354
        @ H.264 luma deblocking core, filtering a 16-pixel edge (both
        @ 8-pixel halves at once in q registers).
        @ In:  q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2,
        @      r2 = alpha, r3 = beta, d24[0] = packed tc0
        @      (from h264_loop_filter_start).
        @ Out: q8 = p0', q0 = q0', q4 = p1', q5 = q1'
        @      (q4/q5 via vbsl: filtered value where the p2/q2 beta test
        @      passed, original p1/q1 elsewhere).
        @ Clobbers q2, q6, q7, q10-q15 — callers save q4-q7 with
        @ align_push_regs and spill q2/q10 if still needed.
        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8    @ replicate each tc0 ...
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16   @ ... across its 4 pixels
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0    @ mask of tc0 < 0 lanes
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7    @ drop lanes with tc0 < 0
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15   @ q6 = overall filter mask
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6    @ q4 = p1-filter mask
        vand            q5,  q5,  q6    @ q5 = q1-filter mask
        vand            q12, q12, q6    @ tc0 where filtering enabled
        vrhadd.u8       q14, q8,  q0    @ (p0 + q0 + 1) >> 1
        vsub.i8         q6,  q12, q4    @ tc = tc0 (+1 per extra ...
        vqadd.u8        q7,  q9,  q12   @ p1 + tc0 (clip upper bound)
        vhadd.u8        q10, q10, q14   @ p1' candidate
        vsub.i8         q6,  q6,  q5    @ ... enabled p1/q1 filter; masks are -1)
        vhadd.u8        q14, q2,  q14   @ q1' candidate
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12   @ p1 - tc0 (clip lower bound)
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11   @ q7 = clipped new p1
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11   @ q14 = clipped new q1
        vmovl.u8        q10, d1
        @ delta = clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3    @ rounding shift gives the "+4"
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9    @ q4 = p1' where enabled, else p1
        vbsl            q5,  q14, q1    @ q5 = q1' where enabled, else q1
        vneg.s8         q7,  q6         @ -tc
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6    @ clamp delta to [-tc, tc]
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4    @ p0 + delta
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4    @ q0 - delta
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14        @ q8 = saturated p0'
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11        @ q0 = saturated q0'
        vqmovun.s16     d1,  q12
        .endm
421

    
422
function ff_h264_v_loop_filter_luma_neon, export=1
        @ Vertical (horizontal-edge) luma deblock filter.
        @ r0 = pix (first row below the edge, i.e. the q0 row),
        @ r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0 pointer.
        h264_loop_filter_start          @ may return early via bx lr

        vld1.64         {d0, d1},  [r0,:128], r1        @ q0 row
        vld1.64         {d2, d3},  [r0,:128], r1        @ q1 row
        vld1.64         {d4, d5},  [r0,:128], r1        @ q2 row
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1            @ back up 6 rows to p2
        vld1.64         {d20,d21}, [r0,:128], r1        @ p2 row
        vld1.64         {d18,d19}, [r0,:128], r1        @ p1 row
        vld1.64         {d16,d17}, [r0,:128], r1        @ p0 row; r0 back at q0

        align_push_regs                 @ filter clobbers q4-q7

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1            @ back to the p1 row
        vst1.64         {d8, d9},  [r0,:128], r1        @ p1'
        vst1.64         {d16,d17}, [r0,:128], r1        @ p0'
        vst1.64         {d0, d1},  [r0,:128], r1        @ q0'
        vst1.64         {d10,d11}, [r0,:128]            @ q1'

        align_pop_regs
        bx              lr
        .endfunc
447

    
448
function ff_h264_h_loop_filter_luma_neon, export=1
        @ Horizontal (vertical-edge) luma deblock filter.
        @ r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0 ptr.
        @ Loads 16 rows of 8 bytes starting 4 pixels left of the edge,
        @ transposes so the p3..q3 columns become registers, runs the
        @ shared filter core, transposes back and stores.
        h264_loop_filter_start          @ may return early via bx lr

        sub             r0,  r0,  #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        @ After the transpose: q3=p3 q10=p2 q9=p1 q8=p0 q0=q0 q1=q1
        @ q2=q2 q13=q3 (one column of 16 pixels per register).
        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs
        @ The filter core clobbers q2 and q10, but the q2/p2 columns are
        @ needed again for the output transpose — spill them.
        sub             sp,  sp,  #16
        vst1.64         {d4, d5},  [sp,:128]
        sub             sp,  sp,  #16
        vst1.64         {d20,d21}, [sp,:128]

        h264_loop_filter_luma

        vld1.64         {d20,d21}, [sp,:128]!           @ restore p2
        vld1.64         {d4, d5},  [sp,:128]!           @ restore q2

        @ p1'/q1' came back in q4/q5; transpose the filtered columns
        @ back into rows.
        transpose_8x8   q3, q10, q4, q8, q0, q5, q2, q13

        sub             r0,  r0,  r1, lsl #4            @ rewind 16 rows
        vst1.64         {d6},  [r0], r1
        vst1.64         {d20}, [r0], r1
        vst1.64         {d8},  [r0], r1
        vst1.64         {d16}, [r0], r1
        vst1.64         {d0},  [r0], r1
        vst1.64         {d10}, [r0], r1
        vst1.64         {d4},  [r0], r1
        vst1.64         {d26}, [r0], r1
        vst1.64         {d7},  [r0], r1
        vst1.64         {d21}, [r0], r1
        vst1.64         {d9},  [r0], r1
        vst1.64         {d17}, [r0], r1
        vst1.64         {d1},  [r0], r1
        vst1.64         {d11}, [r0], r1
        vst1.64         {d5},  [r0], r1
        vst1.64         {d27}, [r0], r1

        align_pop_regs
        bx              lr
        .endfunc
505

    
506
        @ H.264 chroma deblocking core for one 8-pixel edge.
        @ In:  d18 = p1, d16 = p0, d0 = q0, d2 = q1,
        @      r2 = alpha, r3 = beta, d24[0] = packed tc0.
        @ Out: d16 = p0', d0 = q0'.  delta is clamped to [-tc, tc] with
        @ tc taken from d24 (callers are expected to pass the chroma tc
        @ values — NOTE(review): no +1 adjust visible here; presumably
        @ done by the caller — and masked by the alpha/beta/tc0 tests).
        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16   @ q0 - p0 (widened)
        vsli.16         d24, d24, #8    @ replicate tc0 across pixel pairs
        vshl.i16        q2,  q2,  #2    @ (q0 - p0) << 2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18   @ ... + p1
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2    @ ... - q1
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0    @ tc0 < 0 lanes
        vrshrn.i16      d4,  q2,  #3    @ delta = (sum + 4) >> 3
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25   @ drop tc0 < 0 lanes
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24        @ -tc
        vand            d26, d26, d30   @ d26 = filter-enable mask
        vmin.s8         d4,  d4,  d24   @ clamp delta to [-tc, tc]
        vmovl.u8        q14, d16
        vand            d4,  d4,  d26   @ zero delta where disabled
        vmax.s8         d4,  d4,  d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4    @ p0 + delta
        vsubw.s8        q11, q11, d4    @ q0 - delta
        vqmovun.s16     d16, q14        @ saturated p0'
        vqmovun.s16     d0,  q11        @ saturated q0'
        .endm
538

    
539
function ff_h264_v_loop_filter_chroma_neon, export=1
        @ Vertical (horizontal-edge) chroma deblock filter.
        @ r0 = pix (q0 row), r1 = stride, r2 = alpha, r3 = beta,
        @ [sp] = tc0 pointer.
        h264_loop_filter_start          @ may return early via bx lr

        sub             r0,  r0,  r1, lsl #1            @ back up 2 rows to p1
        vld1.64         {d18}, [r0,:64], r1             @ p1
        vld1.64         {d16}, [r0,:64], r1             @ p0
        vld1.64         {d0},  [r0,:64], r1             @ q0
        vld1.64         {d2},  [r0,:64]                 @ q1

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1            @ back to the p0 row
        vst1.64         {d16}, [r0,:64], r1             @ p0'
        vst1.64         {d0},  [r0,:64], r1             @ q0'

        bx              lr
        .endfunc
556

    
557
function ff_h264_h_loop_filter_chroma_neon, export=1
        @ Horizontal (vertical-edge) chroma deblock filter.
        @ r0 = pix, r1 = stride, r2 = alpha, r3 = beta, [sp] = tc0 ptr.
        @ Gathers 8 rows of 4 bytes starting 2 pixels left of the edge,
        @ transposes 4x8 via vtrn so p1/p0/q0/q1 become registers, runs
        @ the shared core, transposes back and scatters.
        h264_loop_filter_start          @ may return early via bx lr

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        @ Rows -> columns: d18 = p1, d16 = p0, d0 = q0, d2 = q1.
        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        @ Columns -> rows (same permutation is its own inverse here).
        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3            @ rewind 8 rows
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
        .endfunc
594

    
595
        /* H.264 qpel MC */
596

    
597
        @ Load the qpel 6-tap filter constants into d6 using \r as
        @ scratch: \r = (20 << 16) | 5, so as 16-bit lanes d6[0] = 5 and
        @ d6[1] = 20 — the lanes referenced by the lowpass macros' vmls
        @ and vmla.  movw must precede movt (movt keeps the low half).
        .macro  lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
        .endm
602

    
603
        @ H.264 qpel 6-tap lowpass (1,-5,20,20,-5,1) applied horizontally
        @ to two 8-pixel rows; \r0:\r1 and \r2:\r3 each supply the 13
        @ source bytes.  Tap weights come from d6 (set by lowpass_const).
        @ With narrow=1 (default) the sums are rounded/narrowed into the
        @ d registers \d0/\d1 via vqrshrun #5; with narrow=0 the raw
        @ 16-bit sums are left in the q registers \d0/\d1 (for a second,
        @ vertical pass).  Scratch: q1, q2, q9, q10, d30, d31 (and q0/q8
        @ when narrowing).
        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2       @ src+2
        vext.8          d3,  \r0, \r1, #3       @ src+3
        vaddl.u8        q1,  d2,  d3            @ inner pair (x20)
        vext.8          d4,  \r0, \r1, #1       @ src+1
        vext.8          d5,  \r0, \r1, #4       @ src+4
        vaddl.u8        q2,  d4,  d5            @ middle pair (x-5)
        vext.8          d30, \r0, \r1, #5       @ src+5
        vaddl.u8        t0,  \r0, d30           @ outer pair (x1)
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]         @ + 20 * inner
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]         @ - 5 * middle
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5            @ (sum + 16) >> 5, saturated
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq  t0
        .unreq  t1
        .endm
638

    
639
        @ Single-row variant of lowpass_8: 6-tap (1,-5,20,20,-5,1)
        @ horizontal filter on the 13 bytes in \r0:\r1.  With narrow=1
        @ the result is rounded into \d0 (vqrshrun #5); otherwise the
        @ 16-bit sums stay in the q register \d0.  Scratch: q1, q2, d30
        @ (and q0 when narrowing).
        .macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2       @ src+2
        vext.8          d3,  \r0, \r1, #3       @ src+3
        vaddl.u8        q1,  d2,  d3            @ inner pair (x20)
        vext.8          d4,  \r0, \r1, #1       @ src+1
        vext.8          d5,  \r0, \r1, #4       @ src+4
        vaddl.u8        q2,  d4,  d5            @ middle pair (x-5)
        vext.8          d30, \r0, \r1, #5       @ src+5
        vaddl.u8        t0,  \r0, d30           @ outer pair (x1)
        vmla.i16        t0,  q1,  d6[1]         @ + 20 * inner
        vmls.i16        t0,  q2,  d6[0]         @ - 5 * middle
.if \narrow
        vqrshrun.s16    \d0, t0,  #5            @ (sum + 16) >> 5, saturated
.endif
        .unreq  t0
        .endm
660

    
661
        @ Second pass of the 2-D (hv) qpel filter, operating on the
        @ 16-bit intermediates from a non-narrowing lowpass_8: 6-tap
        @ (1,-5,20,20,-5,1) across \r0:\r1, with the outer taps supplied
        @ pre-positioned in \l0/\h0 and \l1/\h1.  20*x is built as
        @ (x<<4)+(x<<2) and 5*x as x+(x<<2); result is rounded with
        @ vrshrn #10 and saturated to bytes in \d.
        @ Clobbers q0-q3, q8-q10, q15, and \r1 (vext writes into it).
        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2       @ src+2
        vext.16         q0,  \r0, \r1, #3       @ src+3
        vaddl.s16       q9,  d2,  d0            @ inner pair, low half
        vext.16         q2,  \r0, \r1, #1       @ src+1
        vaddl.s16       q1,  d3,  d1            @ inner pair, high half
        vext.16         q3,  \r0, \r1, #4       @ src+4
        vaddl.s16       q10, d4,  d6            @ middle pair, low half
        vext.16         \r1, \r0, \r1, #5       @ src+5 (clobbers \r1)
        vaddl.s16       q2,  d5,  d7            @ middle pair, high half
        vaddl.s16       q0,  \h0, \h1           @ outer pair, high half
        vaddl.s16       q8,  \l0, \l1           @ outer pair, low half

        @ low half: 20 * inner = (inner << 4) + (inner << 2),
        @           5 * middle = middle + (middle << 2)
        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        @ high half, same scheme
        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8            @ + outer
        vsub.i32        q9,  q9,  q10           @ - 5 * middle

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10           @ (sum + 512) >> 10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9                 @ saturate to u8
        .endm
697

    
698
function put_h264_qpel16_h_lowpass_neon_packed
        @ 16x16 horizontal lowpass written with a packed dst stride of 8:
        @ filters the left 8 columns, rewinds the source, then tail-calls
        @ the 8-wide routine for the right half.  r4 holds the caller's
        @ return address across the first call (non-standard linkage —
        @ internal helper only).  r1 = src, r2 = src stride.
        mov             r4,  lr
        mov             ip,  #16                        @ 16 rows
        mov             r3,  #8                         @ packed dst stride
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4            @ src -= 16*stride
        add             r1,  r1,  #8                    @ right 8 columns
        mov             ip,  #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon   @ tail call, returns to r4
        .endfunc
709

    
710
function put_h264_qpel16_h_lowpass_neon
        @ 16x16 horizontal lowpass: left 8 columns via the 8-wide worker,
        @ then rewind and FALL THROUGH into put_h264_qpel8_h_lowpass_neon
        @ (which must immediately follow in the file) for the right half.
        @ r0 = dst, r1 = src, r2 = src stride, r3 = dst stride.
        push            {lr}
        mov             ip,  #16                        @ 16 rows
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4            @ dst -= 16*dststride
        sub             r1,  r1,  r2, lsl #4            @ src -= 16*srcstride
        add             r0,  r0,  #8                    @ step to right half
        add             r1,  r1,  #8
        mov             ip,  #16
        pop             {lr}
        @ no branch: falls through into put_h264_qpel8_h_lowpass_neon
        .endfunc
721

    
722
function put_h264_qpel8_h_lowpass_neon
        @ 8-wide horizontal 6-tap lowpass worker, two rows per iteration.
        @ r0 = dst, r1 = src, r2 = src stride, r3 = dst stride,
        @ ip = row count (even).  Tap constants must be in d6.
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
        vst1.64         {d0},     [r0,:64], r3
        vst1.64         {d16},    [r0,:64], r3
        bne             1b
        bx              lr
        .endfunc
732

    
733
function put_h264_qpel16_h_lowpass_l2_neon
        @ 16x16 horizontal lowpass averaged with a second prediction
        @ (read via r3).  Left half via the 8-wide worker, then rewind
        @ all three pointers and FALL THROUGH into
        @ put_h264_qpel8_h_lowpass_l2_neon (must immediately follow).
        @ r0 = dst, r1 = src, r2 = stride (all three), r3 = second src.
        push            {lr}
        mov             ip,  #16                        @ 16 rows
        bl              put_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4            @ rewind 16 rows each
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8                    @ step to right half
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             ip,  #16
        pop             {lr}
        @ no branch: falls through into put_h264_qpel8_h_lowpass_l2_neon
        .endfunc
746

    
747
function put_h264_qpel8_h_lowpass_l2_neon
        @ 8-wide horizontal lowpass averaged (vrhadd) with a second
        @ 8-byte prediction row read from r3; two rows per iteration.
        @ r0 = dst, r1 = src, r2 = stride (src/dst/second), r3 = second
        @ src, ip = row count (even).  Tap constants must be in d6.
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28},     [r3], r2
        vld1.64         {d29},     [r3], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14           @ average with 2nd prediction
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endfunc
760

    
761
function put_h264_qpel16_v_lowpass_neon_packed
        @ 16x16 vertical lowpass with packed dst (stride 8): four 8x8
        @ quadrants via the 8-wide worker, ending in a tail call.
        @ r1 = src, r3 = src stride; r4 holds the caller's return
        @ address (non-standard linkage — internal helper only).
        mov             r4,  lr
        mov             r2,  #8                         @ packed dst stride
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2            @ back up 4 rows (filter margin)
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4            @ rewind 20 rows total ...
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8                    @ ... and over to the right half
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon   @ tail call, returns to r4
        .endfunc
775

    
776
function put_h264_qpel16_v_lowpass_neon
        @ 16x16 vertical lowpass: three quadrants via calls to the 8-wide
        @ worker, fourth by FALLING THROUGH into
        @ put_h264_qpel8_v_lowpass_neon (must immediately follow).
        @ r0 = dst, r1 = src, r2 = dst stride, r3 = src stride;
        @ r4 holds the caller's return address (internal linkage).
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2            @ back up 4 rows (filter margin)
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4            @ dst back to top ...
        add             r0,  r0,  #8                    @ ... right half
        sub             r1,  r1,  r3, lsl #4            @ src rewound 20 rows ...
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8                    @ ... right half
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        @ no branch: falls through into put_h264_qpel8_v_lowpass_neon
        .endfunc
790

    
791
function put_h264_qpel8_v_lowpass_neon
        @ 8x8 vertical 6-tap lowpass worker.
        @ r0 = dst, r1 = src, r2 = dst stride, r3 = src stride.
        @ Loads 13 source rows, transposes so columns become rows, runs
        @ the horizontal lowpass_8 on them, transposes back and stores
        @ 8 rows.  Tap constants must be in d6.  Uses q4-q7 (d8-d15) —
        @ callers are responsible for saving the callee-saved d regs.
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
        .endfunc
824

    
825
function put_h264_qpel16_v_lowpass_l2_neon
        @ 16x16 vertical lowpass averaged with a second prediction:
        @ three quadrants via calls, fourth by FALLING THROUGH into
        @ put_h264_qpel8_v_lowpass_l2_neon (must immediately follow).
        @ r0 = dst, r1 = src, r2 = second-src stride, r3 = src/dst
        @ stride, ip = second-src pointer; r4 holds the caller's return
        @ address (internal linkage).
        mov             r4,  lr
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2            @ back up 4 rows (filter margin)
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4            @ dst back to top ...
        sub             ip,  ip,  r2, lsl #4            @ second src back to top ...
        add             r0,  r0,  #8                    @ ... right half
        add             ip,  ip,  #8
        sub             r1,  r1,  r3, lsl #4            @ src rewound 20 rows ...
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8                    @ ... right half
        bl              put_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        @ no branch: falls through into put_h264_qpel8_v_lowpass_l2_neon
        .endfunc
841

    
842
@ 8x8 vertical 6-tap qpel lowpass averaged with a second prediction.
@   r0 = dst (stride r3), r1 = src (stride r3; 13 rows are read),
@   ip = second prediction (stride r2)
@ Uses q0-q7 and q11-q14; d8-d15 are callee-saved, so outer callers
@ wrap this in vpush/vpop {d8-d15}.
function put_h264_qpel8_v_lowpass_l2_neon
        @ Load the 13 source rows needed for 8 output rows of the
        @ 6-tap filter; r1 is left advanced by 12 rows.
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        @ Transpose so the vertical filter can run as the horizontal
        @ lowpass_8 macro, then transpose the filtered rows back.
        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        @ Load the second prediction and round-average (vrhadd.u8)
        @ with the filtered rows; loads interleaved to hide latency.
        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2

        @ Store the 8 averaged rows.
        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vrhadd.u8       q5,  q5,  q13
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
        .endfunc
@ Core of the 8x8 H+V (centre) qpel lowpass.
@   r1 = src - 2 (stride r3; 13 rows of 16 pixels are read)
@   r4 = 16-byte aligned scratch buffer (16*12 bytes), ip clobbered
@ Pass 1 filters the rows horizontally, keeping widened 16-bit
@ intermediates (narrow=0) in the scratch buffer.  The intermediate
@ block is then transposed and pass 2 runs lowpass_8.16 down the
@ former columns, narrowing back to 8 bits.  The 8x8 result is
@ returned in registers: d12-d15 then d8-d11 in row order (see the
@ store sequence in put_h264_qpel8_hv_lowpass_neon).
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12                @ 12 rows in the loop...
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!   @ spill 16-bit intermediates
        bne             1b

        vld1.64         {d0, d1},  [r1]         @ ...plus the 13th row here
        lowpass_8_1     d0,  d1,  q12, narrow=0

        @ Re-load the intermediates walking the scratch buffer
        @ backwards, 16 bytes (one row) at a time.
        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        @ Transpose the 16-bit data in two register groups.
        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        @ Spill the columns that don't fit in registers back to the
        @ scratch buffer (stored high-to-low, matching the -16 walk).
        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        @ Second (vertical) pass on the columns still live in
        @ registers; lowpass_8.16 narrows the results to 8 bits.
        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        @ Remaining columns re-loaded from scratch (ip is still -16).
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        @ Back to row order for the caller's stores.
        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
        .endfunc
@ 8x8 H+V qpel lowpass: filter via _top, then store the result.
@   r0 = dst (stride r2); r1/r3/r4 as for _top.
@ r10 preserves lr across the nested bl (r10 is saved by callers).
function put_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
        @ _top returns rows 0-3 in d12-d15 and rows 4-7 in d8-d11.
        vst1.64         {d12},     [r0,:64], r2
        vst1.64         {d13},     [r0,:64], r2
        vst1.64         {d14},     [r0,:64], r2
        vst1.64         {d15},     [r0,:64], r2
        vst1.64         {d8},      [r0,:64], r2
        vst1.64         {d9},      [r0,:64], r2
        vst1.64         {d10},     [r0,:64], r2
        vst1.64         {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
        .endfunc
@ 8x8 H+V qpel lowpass averaged with a second prediction.
@   r0 = dst (stride r3); r1/r3/r4 as for _top;
@   r2 = second prediction, 64 contiguous bytes (8x8), consumed
@        with post-increment so r2 ends 64 bytes further on.
function put_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr                 @ preserve lr across the bl
        bl              put_h264_qpel8_hv_lowpass_neon_top

        @ Round-average the filtered rows (d12-d15 = rows 0-3,
        @ d8-d11 = rows 4-7) with the packed second prediction.
        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4

        vst1.64         {d0},      [r0,:64], r3
        vrhadd.u8       q3,  q3,  q5
        vst1.64         {d1},      [r0,:64], r3
        vst1.64         {d2},      [r0,:64], r3
        vst1.64         {d3},      [r0,:64], r3
        vst1.64         {d4},      [r0,:64], r3
        vst1.64         {d5},      [r0,:64], r3
        vst1.64         {d6},      [r0,:64], r3
        vst1.64         {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
        .endfunc
@ 16x16 H+V qpel lowpass as four 8x8 calls; the fourth quadrant is a
@ tail call (b) after restoring lr from r9.
@   r0 = dst (stride r2), r1 = src (stride r3), r4 = scratch buffer.
function put_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr                 @ save return address
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2    @ rewind 4-row filter overlap
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4    @ src back 20 rows, right 8
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4    @ dst back to top, right 8
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9                 @ restore lr, tail-call the
        b               put_h264_qpel8_hv_lowpass_neon @ bottom-right quadrant
        .endfunc
@ 16x16 H+V qpel lowpass averaged with a packed first-pass prediction
@ stored at r4-256 (each 8x8 call consumes its 64-byte chunk via the
@ post-incremented r2).  r0 = dst (stride r3), r1 = src (stride r3),
@ r4 = scratch.  Fourth quadrant is a tail call after restoring lr.
function put_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr                 @ save return address
        sub             r2,  r4,  #256          @ l2 src = packed prediction
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2    @ rewind 4-row filter overlap
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4    @ src back 20 rows, right 8
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4    @ dst back to top, right 8
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9                 @ restore lr, tail-call the
        b               put_h264_qpel8_hv_lowpass_l2_neon @ last quadrant
        .endfunc
@ qpel8 (1,0): horizontal lowpass averaged with the unshifted source.
function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 @ l2 source = src itself
        sub             r1,  r1,  #2            @ centre the 6-tap window
        mov             ip,  #8                 @ row count
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc
@ qpel8 (2,0): plain horizontal lowpass (half-pel).
function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2            @ centre the 6-tap window
        mov             r3,  r2                 @ dst stride = src stride
        mov             ip,  #8                 @ row count
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc
@ qpel8 (3,0): horizontal lowpass averaged with src+1.
function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            @ l2 source = src + 1
        sub             r1,  r1,  #2            @ centre the 6-tap window
        mov             ip,  #8                 @ row count
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc
@ qpel8 (0,1): vertical lowpass averaged with the unshifted source.
function ff_put_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip,  r1                 @ l2 source = src itself
put_h264_qpel8_mc01:                            @ also entered from mc03
        lowpass_const   r3
        mov             r3,  r2                 @ both strides = r2
        sub             r1,  r1,  r2, lsl #1    @ back up 2 rows of taps
        vpush           {d8-d15}                @ callee uses d8-d15
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc
@ qpel8 (1,1): horizontal lowpass into a 64-byte stack temp, then a
@ vertical lowpass averaged with that temp.
function ff_put_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r2, lr}
put_h264_qpel8_mc11:                            @ also entered from mc31/mc13/mc33
        lowpass_const   r3
        sub             sp,  sp,  #64           @ 8x8 temp buffer
        mov             r0,  sp                 @ first pass writes the temp
        sub             r1,  r1,  #2
        mov             r3,  #8                 @ temp stride
        mov             ip,  #8                 @ row count
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  [sp, #128]         @ reload saved dst/src (r0,r1)
        mov             r3,  r2
        add             ip,  sp,  #64           @ l2 source = temp (above vpush)
        sub             r1,  r1,  r2, lsl #1    @ back up 2 rows of taps
        mov             r2,  #8                 @ temp stride
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #76           @ drop temp + saved r0-r2
        pop             {pc}
        .endfunc
@ qpel8 (2,1): horizontal lowpass into a stack temp, then H+V lowpass
@ averaged with that temp.
function ff_put_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:                            @ also entered from mc23
        lowpass_const   r3
        mov             r11, sp                 @ frame pointer for the
        bic             sp,  sp,  #15           @ 16-byte aligned frame
        sub             sp,  sp,  #(8*8+16*12)  @ 8x8 temp + hv scratch
        sub             r1,  r1,  #2
        mov             r3,  #8                 @ temp stride
        mov             r0,  sp                 @ first pass writes the temp
        mov             ip,  #8                 @ row count
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0                 @ end of temp = hv scratch
        ldrd            r0,  [r11]              @ reload saved dst/src (r0,r1)
        sub             r1,  r1,  r2, lsl #1    @ back 2 rows and 2 columns
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64           @ l2 source = h-filtered temp
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8           @ unwind, skip saved r0/r1
        pop             {r4, r10, r11, pc}
        .endfunc
@ qpel8 (3,1): as mc11, but the saved r1 (used by the second,
@ vertical pass) is src+1 while the first pass still starts at src.
function ff_put_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1            @ save src + 1...
        push            {r0, r1, r2, lr}
        sub             r1,  r1,  #1            @ ...but filter from src
        b               put_h264_qpel8_mc11
        .endfunc
@ qpel8 (0,2): plain vertical lowpass (half-pel).
function ff_put_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1    @ back up 2 rows of taps
        mov             r3,  r2                 @ both strides = r2
        vpush           {d8-d15}                @ callee uses d8-d15
        bl              put_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc
@ qpel8 (1,2): vertical lowpass into a stack temp, then H+V lowpass
@ averaged with that temp.
function ff_put_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:                            @ also entered from mc32
        lowpass_const   r3
        mov             r11, sp                 @ frame pointer for the
        bic             sp,  sp,  #15           @ 16-byte aligned frame
        sub             sp,  sp,  #(8*8+16*12)  @ 8x8 temp + hv scratch
        sub             r1,  r1,  r2, lsl #1    @ back up 2 rows of taps
        mov             r3,  r2                 @ src stride
        mov             r2,  #8                 @ temp stride
        mov             r0,  sp                 @ first pass writes the temp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0                 @ end of temp = hv scratch
        ldrd            r0,  [r11]              @ reload saved dst/src (r0,r1)
        sub             r1,  r1,  r3, lsl #1    @ back 2 rows and 2 columns
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64           @ l2 source = v-filtered temp
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8           @ unwind, skip saved r0/r1
        pop             {r4, r10, r11, pc}
        .endfunc
@ qpel8 (2,2): plain H+V lowpass (centre half-pel).
function ff_put_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp                 @ frame pointer for the
        bic             sp,  sp,  #15           @ 16-byte aligned frame
        sub             r1,  r1,  r2, lsl #1    @ back 2 rows and 2 columns
        sub             r1,  r1,  #2
        mov             r3,  r2                 @ src stride = dst stride
        sub             sp,  sp,  #(16*12)      @ hv scratch buffer
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11                @ unwind the aligned frame
        pop             {r4, r10, r11, pc}
        .endfunc
@ qpel8 (3,2): as mc12, but the first (vertical temp) pass reads from
@ src+1; the saved r1 for the hv pass remains the original src.
function ff_put_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1            @ temp pass starts at src + 1
        b               put_h264_qpel8_mc12
        .endfunc
@ qpel8 (0,3): vertical lowpass averaged with the source one row down.
function ff_put_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             ip,  r1,  r2            @ l2 source = src + stride
        b               put_h264_qpel8_mc01
        .endfunc
@ qpel8 (1,3): as mc11, with the horizontal temp taken one row down;
@ the saved r1 for the vertical pass remains the original src.
function ff_put_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2            @ temp pass starts one row down
        b               put_h264_qpel8_mc11
        .endfunc
@ qpel8 (2,3): as mc21, with the horizontal temp taken one row down.
function ff_put_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2            @ temp pass starts one row down
        b               put_h264_qpel8_mc21
        .endfunc
@ qpel8 (3,3): as mc11, with the horizontal temp one row down and the
@ saved r1 (vertical pass) shifted one pixel right.
function ff_put_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1            @ save src + 1...
        push            {r0, r1, r2, lr}
        add             r1,  r1,  r2            @ ...temp pass one row down,
        sub             r1,  r1,  #1            @ back at the src column
        b               put_h264_qpel8_mc11
        .endfunc
@ qpel16 (1,0): horizontal lowpass averaged with the unshifted source.
function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 @ l2 source = src itself
        sub             r1,  r1,  #2            @ centre the 6-tap window
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc
@ qpel16 (2,0): plain horizontal lowpass (half-pel).
function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2            @ centre the 6-tap window
        mov             r3,  r2                 @ dst stride = src stride
        b               put_h264_qpel16_h_lowpass_neon
        .endfunc
@ qpel16 (3,0): horizontal lowpass averaged with src+1.
function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            @ l2 source = src + 1
        sub             r1,  r1,  #2            @ centre the 6-tap window
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc
@ qpel16 (0,1): vertical lowpass averaged with the unshifted source.
@ r4 is saved because the 16x16 v_lowpass helper uses it for lr.
function ff_put_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}
        mov             ip,  r1                 @ l2 source = src itself
put_h264_qpel16_mc01:                           @ also entered from mc03
        lowpass_const   r3
        mov             r3,  r2                 @ both strides = r2
        sub             r1,  r1,  r2, lsl #1    @ back up 2 rows of taps
        vpush           {d8-d15}                @ callee uses d8-d15
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc
@ qpel16 (1,1): horizontal lowpass into a 256-byte stack temp, then a
@ vertical lowpass averaged with that temp.
function ff_put_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, lr}
put_h264_qpel16_mc11:                           @ also entered from mc31/mc13/mc33
        lowpass_const   r3
        sub             sp,  sp,  #256          @ 16x16 temp buffer
        mov             r0,  sp                 @ first pass writes the temp
        sub             r1,  r1,  #2
        mov             r3,  #16                @ temp stride
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        add             r0,  sp,  #256          @ saved regs sit above temp+vpush:
        ldrd            r0,  [r0, #64]          @ reload saved dst/src (r0,r1)
        mov             r3,  r2
        add             ip,  sp,  #64           @ l2 source = temp (above vpush)
        sub             r1,  r1,  r2, lsl #1    @ back up 2 rows of taps
        mov             r2,  #16                @ temp stride
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  sp,  #(256+8)      @ drop temp + saved r0/r1
        pop             {r4, pc}
        .endfunc
@ qpel16 (2,1): packed horizontal lowpass into a stack temp, then H+V
@ lowpass averaged with it (the l2 helper finds the temp at r4-256).
function ff_put_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:                           @ also entered from mc23
        lowpass_const   r3
        mov             r11, sp                 @ frame pointer for the
        bic             sp,  sp,  #15           @ 16-byte aligned frame
        sub             sp,  sp,  #(16*16+16*12) @ packed temp + hv scratch
        sub             r1,  r1,  #2
        mov             r0,  sp                 @ first pass writes the temp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0                 @ end of temp = hv scratch
        ldrd            r0,  [r11]              @ reload saved dst/src (r0,r1)
        sub             r1,  r1,  r2, lsl #1    @ back 2 rows and 2 columns
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8           @ unwind, skip saved r0/r1
        pop             {r4-r5, r9-r11, pc}
        .endfunc
@ qpel16 (3,1): as mc11, but the saved r1 (vertical pass) is src+1
@ while the first pass still starts at src.
function ff_put_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1            @ save src + 1...
        push            {r0, r1, r4, lr}
        sub             r1,  r1,  #1            @ ...but filter from src
        b               put_h264_qpel16_mc11
        .endfunc
@ qpel16 (0,2): plain vertical lowpass (half-pel).
@ r4 is saved because the 16x16 v_lowpass helper uses it for lr.
function ff_put_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1    @ back up 2 rows of taps
        mov             r3,  r2                 @ both strides = r2
        vpush           {d8-d15}                @ callee uses d8-d15
        bl              put_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc
@ qpel16 (1,2): packed vertical lowpass into a stack temp, then H+V
@ lowpass averaged with it (the l2 helper finds the temp at r4-256).
function ff_put_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:                           @ also entered from mc32
        lowpass_const   r3
        mov             r11, sp                 @ frame pointer for the
        bic             sp,  sp,  #15           @ 16-byte aligned frame
        sub             sp,  sp,  #(16*16+16*12) @ packed temp + hv scratch
        sub             r1,  r1,  r2, lsl #1    @ back up 2 rows of taps
        mov             r0,  sp                 @ first pass writes the temp
        mov             r3,  r2                 @ src stride
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0                 @ end of temp = hv scratch
        ldrd            r0,  [r11]              @ reload saved dst/src (r0,r1)
        sub             r1,  r1,  r3, lsl #1    @ back 2 rows and 2 columns
        sub             r1,  r1,  #2
        mov             r2,  r3                 @ dst stride
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8           @ unwind, skip saved r0/r1
        pop             {r4-r5, r9-r11, pc}
        .endfunc
@ qpel16 (2,2): plain H+V lowpass (centre half-pel).
function ff_put_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp                 @ frame pointer for the
        bic             sp,  sp,  #15           @ 16-byte aligned frame
        sub             r1,  r1,  r2, lsl #1    @ back 2 rows and 2 columns
        sub             r1,  r1,  #2
        mov             r3,  r2                 @ src stride = dst stride
        sub             sp,  sp,  #(16*12)      @ hv scratch buffer
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11                @ unwind the aligned frame
        pop             {r4, r9-r11, pc}
        .endfunc
@ qpel16 (3,2): as mc12, but the first (vertical temp) pass reads
@ from src+1; the saved r1 for the hv pass remains the original src.
function ff_put_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1            @ temp pass starts at src + 1
        b               put_h264_qpel16_mc12
        .endfunc
@ qpel16 (0,3): vertical lowpass averaged with the source one row down.
function ff_put_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip,  r1,  r2            @ l2 source = src + stride
        b               put_h264_qpel16_mc01
        .endfunc
@ qpel16 (1,3): as mc11, with the horizontal temp taken one row down;
@ the saved r1 for the vertical pass remains the original src.
function ff_put_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, lr}
        add             r1,  r1,  r2            @ temp pass starts one row down
        b               put_h264_qpel16_mc11
        .endfunc
@ qpel16 (2,3): as mc21, with the horizontal temp taken one row down.
function ff_put_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2            @ temp pass starts one row down
        b               put_h264_qpel16_mc21
        .endfunc
@ qpel16 (3,3): as mc11, with the horizontal temp one row down and
@ the saved r1 (vertical pass) shifted one pixel right.
function ff_put_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1            @ save src + 1...
        push            {r0, r1, r4, lr}
        add             r1,  r1,  r2            @ ...temp pass one row down,
        sub             r1,  r1,  #1            @ back at the src column
        b               put_h264_qpel16_mc11
        .endfunc