Statistics
| Branch: | Revision:

ffmpeg / libavcodec / arm / h264dsp_neon.S @ 0115b3ea

History | View | Annotate | Download (55.2 KB)

1
/*
2
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "asm.S"
22

    
23
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
24
        vtrn.32         \r0, \r4
25
        vtrn.32         \r1, \r5
26
        vtrn.32         \r2, \r6
27
        vtrn.32         \r3, \r7
28
        vtrn.16         \r0, \r2
29
        vtrn.16         \r1, \r3
30
        vtrn.16         \r4, \r6
31
        vtrn.16         \r5, \r7
32
        vtrn.8          \r0, \r1
33
        vtrn.8          \r2, \r3
34
        vtrn.8          \r4, \r5
35
        vtrn.8          \r6, \r7
36
        .endm
37

    
38
        .macro transpose_4x4 r0 r1 r2 r3
39
        vtrn.16         \r0, \r2
40
        vtrn.16         \r1, \r3
41
        vtrn.8          \r0, \r1
42
        vtrn.8          \r2, \r3
43
        .endm
44

    
45
        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
46
        vswp            \r0, \r4
47
        vswp            \r1, \r5
48
        vswp            \r2, \r6
49
        vswp            \r3, \r7
50
        .endm
51

    
52
        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
53
        vtrn.32         \r0, \r2
54
        vtrn.32         \r1, \r3
55
        vtrn.32         \r4, \r6
56
        vtrn.32         \r5, \r7
57
        vtrn.16         \r0, \r1
58
        vtrn.16         \r2, \r3
59
        vtrn.16         \r4, \r5
60
        vtrn.16         \r6, \r7
61
        .endm
62

    
63
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
64
        .macro  h264_chroma_mc8 type
65
function ff_\type\()_h264_chroma_mc8_neon, export=1
66
        push            {r4-r7, lr}
67
        ldrd            r4,  [sp, #20]
68
.ifc \type,avg
69
        mov             lr,  r0
70
.endif
71
        pld             [r1]
72
        pld             [r1, r2]
73

    
74
        muls            r7,  r4,  r5
75
        rsb             r6,  r7,  r5,  lsl #3
76
        rsb             ip,  r7,  r4,  lsl #3
77
        sub             r4,  r7,  r4,  lsl #3
78
        sub             r4,  r4,  r5,  lsl #3
79
        add             r4,  r4,  #64
80

    
81
        beq             2f
82

    
83
        add             r5,  r1,  r2
84

    
85
        vdup.8          d0,  r4
86
        lsl             r4,  r2,  #1
87
        vdup.8          d1,  ip
88
        vld1.64         {d4, d5}, [r1], r4
89
        vdup.8          d2,  r6
90
        vld1.64         {d6, d7}, [r5], r4
91
        vdup.8          d3,  r7
92

    
93
        vext.8          d5,  d4,  d5,  #1
94
        vext.8          d7,  d6,  d7,  #1
95

    
96
1:      pld             [r5]
97
        vmull.u8        q8,  d4,  d0
98
        vmlal.u8        q8,  d5,  d1
99
        vld1.64         {d4, d5}, [r1], r4
100
        vmlal.u8        q8,  d6,  d2
101
        vext.8          d5,  d4,  d5,  #1
102
        vmlal.u8        q8,  d7,  d3
103
        vmull.u8        q9,  d6,  d0
104
        subs            r3,  r3,  #2
105
        vmlal.u8        q9,  d7,  d1
106
        vmlal.u8        q9,  d4,  d2
107
        vmlal.u8        q9,  d5,  d3
108
        vrshrn.u16      d16, q8,  #6
109
        vld1.64         {d6, d7}, [r5], r4
110
        pld             [r1]
111
        vrshrn.u16      d17, q9,  #6
112
.ifc \type,avg
113
        vld1.64         {d20}, [lr,:64], r2
114
        vld1.64         {d21}, [lr,:64], r2
115
        vrhadd.u8       q8,  q8,  q10
116
.endif
117
        vext.8          d7,  d6,  d7,  #1
118
        vst1.64         {d16}, [r0,:64], r2
119
        vst1.64         {d17}, [r0,:64], r2
120
        bgt             1b
121

    
122
        pop             {r4-r7, pc}
123

    
124
2:      tst             r6,  r6
125
        add             ip,  ip,  r6
126
        vdup.8          d0,  r4
127
        vdup.8          d1,  ip
128

    
129
        beq             4f
130

    
131
        add             r5,  r1,  r2
132
        lsl             r4,  r2,  #1
133
        vld1.64         {d4}, [r1], r4
134
        vld1.64         {d6}, [r5], r4
135

    
136
3:      pld             [r5]
137
        vmull.u8        q8,  d4,  d0
138
        vmlal.u8        q8,  d6,  d1
139
        vld1.64         {d4}, [r1], r4
140
        vmull.u8        q9,  d6,  d0
141
        vmlal.u8        q9,  d4,  d1
142
        vld1.64         {d6}, [r5], r4
143
        vrshrn.u16      d16, q8,  #6
144
        vrshrn.u16      d17, q9,  #6
145
.ifc \type,avg
146
        vld1.64         {d20}, [lr,:64], r2
147
        vld1.64         {d21}, [lr,:64], r2
148
        vrhadd.u8       q8,  q8,  q10
149
.endif
150
        subs            r3,  r3,  #2
151
        pld             [r1]
152
        vst1.64         {d16}, [r0,:64], r2
153
        vst1.64         {d17}, [r0,:64], r2
154
        bgt             3b
155

    
156
        pop             {r4-r7, pc}
157

    
158
4:      vld1.64         {d4, d5}, [r1], r2
159
        vld1.64         {d6, d7}, [r1], r2
160
        vext.8          d5,  d4,  d5,  #1
161
        vext.8          d7,  d6,  d7,  #1
162

    
163
5:      pld             [r1]
164
        subs            r3,  r3,  #2
165
        vmull.u8        q8,  d4,  d0
166
        vmlal.u8        q8,  d5,  d1
167
        vld1.64         {d4, d5}, [r1], r2
168
        vmull.u8        q9,  d6,  d0
169
        vmlal.u8        q9,  d7,  d1
170
        pld             [r1]
171
        vext.8          d5,  d4,  d5,  #1
172
        vrshrn.u16      d16, q8,  #6
173
        vrshrn.u16      d17, q9,  #6
174
.ifc \type,avg
175
        vld1.64         {d20}, [lr,:64], r2
176
        vld1.64         {d21}, [lr,:64], r2
177
        vrhadd.u8       q8,  q8,  q10
178
.endif
179
        vld1.64         {d6, d7}, [r1], r2
180
        vext.8          d7,  d6,  d7,  #1
181
        vst1.64         {d16}, [r0,:64], r2
182
        vst1.64         {d17}, [r0,:64], r2
183
        bgt             5b
184

    
185
        pop             {r4-r7, pc}
186
        .endfunc
187
        .endm
188

    
189
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
190
        .macro  h264_chroma_mc4 type
191
function ff_\type\()_h264_chroma_mc4_neon, export=1
192
        push            {r4-r7, lr}
193
        ldrd            r4,  [sp, #20]
194
.ifc \type,avg
195
        mov             lr,  r0
196
.endif
197
        pld             [r1]
198
        pld             [r1, r2]
199

    
200
        muls            r7,  r4,  r5
201
        rsb             r6,  r7,  r5,  lsl #3
202
        rsb             ip,  r7,  r4,  lsl #3
203
        sub             r4,  r7,  r4,  lsl #3
204
        sub             r4,  r4,  r5,  lsl #3
205
        add             r4,  r4,  #64
206

    
207
        beq             2f
208

    
209
        add             r5,  r1,  r2
210

    
211
        vdup.8          d0,  r4
212
        lsl             r4,  r2,  #1
213
        vdup.8          d1,  ip
214
        vld1.64         {d4},     [r1], r4
215
        vdup.8          d2,  r6
216
        vld1.64         {d6},     [r5], r4
217
        vdup.8          d3,  r7
218

    
219
        vext.8          d5,  d4,  d5,  #1
220
        vext.8          d7,  d6,  d7,  #1
221
        vtrn.32         d4,  d5
222
        vtrn.32         d6,  d7
223

    
224
        vtrn.32         d0,  d1
225
        vtrn.32         d2,  d3
226

    
227
1:      pld             [r5]
228
        vmull.u8        q8,  d4,  d0
229
        vmlal.u8        q8,  d6,  d2
230
        vld1.64         {d4},     [r1], r4
231
        vext.8          d5,  d4,  d5,  #1
232
        vtrn.32         d4,  d5
233
        vmull.u8        q9,  d6,  d0
234
        vmlal.u8        q9,  d4,  d2
235
        vld1.64         {d6},     [r5], r4
236
        vadd.i16        d16, d16, d17
237
        vadd.i16        d17, d18, d19
238
        vrshrn.u16      d16, q8,  #6
239
        subs            r3,  r3,  #2
240
        pld             [r1]
241
.ifc \type,avg
242
        vld1.32         {d20[0]}, [lr,:32], r2
243
        vld1.32         {d20[1]}, [lr,:32], r2
244
        vrhadd.u8       d16, d16, d20
245
.endif
246
        vext.8          d7,  d6,  d7,  #1
247
        vtrn.32         d6,  d7
248
        vst1.32         {d16[0]}, [r0,:32], r2
249
        vst1.32         {d16[1]}, [r0,:32], r2
250
        bgt             1b
251

    
252
        pop             {r4-r7, pc}
253

    
254
2:      tst             r6,  r6
255
        add             ip,  ip,  r6
256
        vdup.8          d0,  r4
257
        vdup.8          d1,  ip
258
        vtrn.32         d0,  d1
259

    
260
        beq             4f
261

    
262
        vext.32         d1,  d0,  d1,  #1
263
        add             r5,  r1,  r2
264
        lsl             r4,  r2,  #1
265
        vld1.32         {d4[0]},  [r1], r4
266
        vld1.32         {d4[1]},  [r5], r4
267

    
268
3:      pld             [r5]
269
        vmull.u8        q8,  d4,  d0
270
        vld1.32         {d4[0]},  [r1], r4
271
        vmull.u8        q9,  d4,  d1
272
        vld1.32         {d4[1]},  [r5], r4
273
        vadd.i16        d16, d16, d17
274
        vadd.i16        d17, d18, d19
275
        vrshrn.u16      d16, q8,  #6
276
.ifc \type,avg
277
        vld1.32         {d20[0]}, [lr,:32], r2
278
        vld1.32         {d20[1]}, [lr,:32], r2
279
        vrhadd.u8       d16, d16, d20
280
.endif
281
        subs            r3,  r3,  #2
282
        pld             [r1]
283
        vst1.32         {d16[0]}, [r0,:32], r2
284
        vst1.32         {d16[1]}, [r0,:32], r2
285
        bgt             3b
286

    
287
        pop             {r4-r7, pc}
288

    
289
4:      vld1.64         {d4},     [r1], r2
290
        vld1.64         {d6},     [r1], r2
291
        vext.8          d5,  d4,  d5,  #1
292
        vext.8          d7,  d6,  d7,  #1
293
        vtrn.32         d4,  d5
294
        vtrn.32         d6,  d7
295

    
296
5:      vmull.u8        q8,  d4,  d0
297
        vmull.u8        q9,  d6,  d0
298
        subs            r3,  r3,  #2
299
        vld1.64         {d4},     [r1], r2
300
        vext.8          d5,  d4,  d5,  #1
301
        vtrn.32         d4,  d5
302
        vadd.i16        d16, d16, d17
303
        vadd.i16        d17, d18, d19
304
        pld             [r1]
305
        vrshrn.u16      d16, q8,  #6
306
.ifc \type,avg
307
        vld1.32         {d20[0]}, [lr,:32], r2
308
        vld1.32         {d20[1]}, [lr,:32], r2
309
        vrhadd.u8       d16, d16, d20
310
.endif
311
        vld1.64         {d6},     [r1], r2
312
        vext.8          d7,  d6,  d7,  #1
313
        vtrn.32         d6,  d7
314
        pld             [r1]
315
        vst1.32         {d16[0]}, [r0,:32], r2
316
        vst1.32         {d16[1]}, [r0,:32], r2
317
        bgt             5b
318

    
319
        pop             {r4-r7, pc}
320
        .endfunc
321
        .endm
322

    
323
        .text
324
        .align
325

    
326
        h264_chroma_mc8 put
327
        h264_chroma_mc8 avg
328
        h264_chroma_mc4 put
329
        h264_chroma_mc4 avg
330

    
331
        /* H.264 loop filter */
332

    
333
        .macro h264_loop_filter_start
334
        ldr             ip,  [sp]
335
        tst             r2,  r2
336
        ldr             ip,  [ip]
337
        tstne           r3,  r3
338
        vmov.32         d24[0], ip
339
        and             ip,  ip,  ip, lsl #16
340
        bxeq            lr
341
        ands            ip,  ip,  ip, lsl #8
342
        bxlt            lr
343
        .endm
344

    
345
        .macro align_push_regs
346
        and             ip,  sp,  #15
347
        add             ip,  ip,  #32
348
        sub             sp,  sp,  ip
349
        vst1.64         {d12-d15}, [sp,:128]
350
        sub             sp,  sp,  #32
351
        vst1.64         {d8-d11},  [sp,:128]
352
        .endm
353

    
354
        .macro align_pop_regs
355
        vld1.64         {d8-d11},  [sp,:128]!
356
        vld1.64         {d12-d15}, [sp,:128], ip
357
        .endm
358

    
359
        .macro h264_loop_filter_luma
360
        vdup.8          q11, r2         @ alpha
361
        vmovl.u8        q12, d24
362
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
363
        vmovl.u16       q12, d24
364
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
365
        vsli.16         q12, q12, #8
366
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
367
        vsli.32         q12, q12, #16
368
        vclt.u8         q6,  q6,  q11   @ < alpha
369
        vdup.8          q11, r3         @ beta
370
        vclt.s8         q7,  q12, #0
371
        vclt.u8         q14, q14, q11   @ < beta
372
        vclt.u8         q15, q15, q11   @ < beta
373
        vbic            q6,  q6,  q7
374
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
375
        vand            q6,  q6,  q14
376
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
377
        vclt.u8         q4,  q4,  q11   @ < beta
378
        vand            q6,  q6,  q15
379
        vclt.u8         q5,  q5,  q11   @ < beta
380
        vand            q4,  q4,  q6
381
        vand            q5,  q5,  q6
382
        vand            q12, q12, q6
383
        vrhadd.u8       q14, q8,  q0
384
        vsub.i8         q6,  q12, q4
385
        vqadd.u8        q7,  q9,  q12
386
        vhadd.u8        q10, q10, q14
387
        vsub.i8         q6,  q6,  q5
388
        vhadd.u8        q14, q2,  q14
389
        vmin.u8         q7,  q7,  q10
390
        vqsub.u8        q11, q9,  q12
391
        vqadd.u8        q2,  q1,  q12
392
        vmax.u8         q7,  q7,  q11
393
        vqsub.u8        q11, q1,  q12
394
        vmin.u8         q14, q2,  q14
395
        vmovl.u8        q2,  d0
396
        vmax.u8         q14, q14, q11
397
        vmovl.u8        q10, d1
398
        vsubw.u8        q2,  q2,  d16
399
        vsubw.u8        q10, q10, d17
400
        vshl.i16        q2,  q2,  #2
401
        vshl.i16        q10, q10, #2
402
        vaddw.u8        q2,  q2,  d18
403
        vaddw.u8        q10, q10, d19
404
        vsubw.u8        q2,  q2,  d2
405
        vsubw.u8        q10, q10, d3
406
        vrshrn.i16      d4,  q2,  #3
407
        vrshrn.i16      d5,  q10, #3
408
        vbsl            q4,  q7,  q9
409
        vbsl            q5,  q14, q1
410
        vneg.s8         q7,  q6
411
        vmovl.u8        q14, d16
412
        vmin.s8         q2,  q2,  q6
413
        vmovl.u8        q6,  d17
414
        vmax.s8         q2,  q2,  q7
415
        vmovl.u8        q11, d0
416
        vmovl.u8        q12, d1
417
        vaddw.s8        q14, q14, d4
418
        vaddw.s8        q6,  q6,  d5
419
        vsubw.s8        q11, q11, d4
420
        vsubw.s8        q12, q12, d5
421
        vqmovun.s16     d16, q14
422
        vqmovun.s16     d17, q6
423
        vqmovun.s16     d0,  q11
424
        vqmovun.s16     d1,  q12
425
        .endm
426

    
427
function ff_h264_v_loop_filter_luma_neon, export=1
428
        h264_loop_filter_start
429

    
430
        vld1.64         {d0, d1},  [r0,:128], r1
431
        vld1.64         {d2, d3},  [r0,:128], r1
432
        vld1.64         {d4, d5},  [r0,:128], r1
433
        sub             r0,  r0,  r1, lsl #2
434
        sub             r0,  r0,  r1, lsl #1
435
        vld1.64         {d20,d21}, [r0,:128], r1
436
        vld1.64         {d18,d19}, [r0,:128], r1
437
        vld1.64         {d16,d17}, [r0,:128], r1
438

    
439
        align_push_regs
440

    
441
        h264_loop_filter_luma
442

    
443
        sub             r0,  r0,  r1, lsl #1
444
        vst1.64         {d8, d9},  [r0,:128], r1
445
        vst1.64         {d16,d17}, [r0,:128], r1
446
        vst1.64         {d0, d1},  [r0,:128], r1
447
        vst1.64         {d10,d11}, [r0,:128]
448

    
449
        align_pop_regs
450
        bx              lr
451
        .endfunc
452

    
453
function ff_h264_h_loop_filter_luma_neon, export=1
454
        h264_loop_filter_start
455

    
456
        sub             r0,  r0,  #4
457
        vld1.64         {d6},  [r0], r1
458
        vld1.64         {d20}, [r0], r1
459
        vld1.64         {d18}, [r0], r1
460
        vld1.64         {d16}, [r0], r1
461
        vld1.64         {d0},  [r0], r1
462
        vld1.64         {d2},  [r0], r1
463
        vld1.64         {d4},  [r0], r1
464
        vld1.64         {d26}, [r0], r1
465
        vld1.64         {d7},  [r0], r1
466
        vld1.64         {d21}, [r0], r1
467
        vld1.64         {d19}, [r0], r1
468
        vld1.64         {d17}, [r0], r1
469
        vld1.64         {d1},  [r0], r1
470
        vld1.64         {d3},  [r0], r1
471
        vld1.64         {d5},  [r0], r1
472
        vld1.64         {d27}, [r0], r1
473

    
474
        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
475

    
476
        align_push_regs
477

    
478
        h264_loop_filter_luma
479

    
480
        transpose_4x4   q4, q8, q0, q5
481

    
482
        sub             r0,  r0,  r1, lsl #4
483
        add             r0,  r0,  #2
484
        vst1.32         {d8[0]},  [r0], r1
485
        vst1.32         {d16[0]}, [r0], r1
486
        vst1.32         {d0[0]},  [r0], r1
487
        vst1.32         {d10[0]}, [r0], r1
488
        vst1.32         {d8[1]},  [r0], r1
489
        vst1.32         {d16[1]}, [r0], r1
490
        vst1.32         {d0[1]},  [r0], r1
491
        vst1.32         {d10[1]}, [r0], r1
492
        vst1.32         {d9[0]},  [r0], r1
493
        vst1.32         {d17[0]}, [r0], r1
494
        vst1.32         {d1[0]},  [r0], r1
495
        vst1.32         {d11[0]}, [r0], r1
496
        vst1.32         {d9[1]},  [r0], r1
497
        vst1.32         {d17[1]}, [r0], r1
498
        vst1.32         {d1[1]},  [r0], r1
499
        vst1.32         {d11[1]}, [r0], r1
500

    
501
        align_pop_regs
502
        bx              lr
503
        .endfunc
504

    
505
        .macro h264_loop_filter_chroma
506
        vdup.8          d22, r2         @ alpha
507
        vmovl.u8        q12, d24
508
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
509
        vmovl.u8        q2,  d0
510
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
511
        vsubw.u8        q2,  q2,  d16
512
        vsli.16         d24, d24, #8
513
        vshl.i16        q2,  q2,  #2
514
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
515
        vaddw.u8        q2,  q2,  d18
516
        vclt.u8         d26, d26, d22   @ < alpha
517
        vsubw.u8        q2,  q2,  d2
518
        vdup.8          d22, r3         @ beta
519
        vclt.s8         d25, d24, #0
520
        vrshrn.i16      d4,  q2,  #3
521
        vclt.u8         d28, d28, d22   @ < beta
522
        vbic            d26, d26, d25
523
        vclt.u8         d30, d30, d22   @ < beta
524
        vand            d26, d26, d28
525
        vneg.s8         d25, d24
526
        vand            d26, d26, d30
527
        vmin.s8         d4,  d4,  d24
528
        vmovl.u8        q14, d16
529
        vand            d4,  d4,  d26
530
        vmax.s8         d4,  d4,  d25
531
        vmovl.u8        q11, d0
532
        vaddw.s8        q14, q14, d4
533
        vsubw.s8        q11, q11, d4
534
        vqmovun.s16     d16, q14
535
        vqmovun.s16     d0,  q11
536
        .endm
537

    
538
function ff_h264_v_loop_filter_chroma_neon, export=1
539
        h264_loop_filter_start
540

    
541
        sub             r0,  r0,  r1, lsl #1
542
        vld1.64         {d18}, [r0,:64], r1
543
        vld1.64         {d16}, [r0,:64], r1
544
        vld1.64         {d0},  [r0,:64], r1
545
        vld1.64         {d2},  [r0,:64]
546

    
547
        h264_loop_filter_chroma
548

    
549
        sub             r0,  r0,  r1, lsl #1
550
        vst1.64         {d16}, [r0,:64], r1
551
        vst1.64         {d0},  [r0,:64], r1
552

    
553
        bx              lr
554
        .endfunc
555

    
556
function ff_h264_h_loop_filter_chroma_neon, export=1
557
        h264_loop_filter_start
558

    
559
        sub             r0,  r0,  #2
560
        vld1.32         {d18[0]}, [r0], r1
561
        vld1.32         {d16[0]}, [r0], r1
562
        vld1.32         {d0[0]},  [r0], r1
563
        vld1.32         {d2[0]},  [r0], r1
564
        vld1.32         {d18[1]}, [r0], r1
565
        vld1.32         {d16[1]}, [r0], r1
566
        vld1.32         {d0[1]},  [r0], r1
567
        vld1.32         {d2[1]},  [r0], r1
568

    
569
        vtrn.16         d18, d0
570
        vtrn.16         d16, d2
571
        vtrn.8          d18, d16
572
        vtrn.8          d0,  d2
573

    
574
        h264_loop_filter_chroma
575

    
576
        vtrn.16         d18, d0
577
        vtrn.16         d16, d2
578
        vtrn.8          d18, d16
579
        vtrn.8          d0,  d2
580

    
581
        sub             r0,  r0,  r1, lsl #3
582
        vst1.32         {d18[0]}, [r0], r1
583
        vst1.32         {d16[0]}, [r0], r1
584
        vst1.32         {d0[0]},  [r0], r1
585
        vst1.32         {d2[0]},  [r0], r1
586
        vst1.32         {d18[1]}, [r0], r1
587
        vst1.32         {d16[1]}, [r0], r1
588
        vst1.32         {d0[1]},  [r0], r1
589
        vst1.32         {d2[1]},  [r0], r1
590

    
591
        bx              lr
592
        .endfunc
593

    
594
        /* H.264 qpel MC */
595

    
596
        .macro  lowpass_const r
597
        movw            \r,  #5
598
        movt            \r,  #20
599
        vmov.32         d6[0], \r
600
        .endm
601

    
602
        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
603
.if \narrow
604
        t0 .req q0
605
        t1 .req q8
606
.else
607
        t0 .req \d0
608
        t1 .req \d1
609
.endif
610
        vext.8          d2,  \r0, \r1, #2
611
        vext.8          d3,  \r0, \r1, #3
612
        vaddl.u8        q1,  d2,  d3
613
        vext.8          d4,  \r0, \r1, #1
614
        vext.8          d5,  \r0, \r1, #4
615
        vaddl.u8        q2,  d4,  d5
616
        vext.8          d30, \r0, \r1, #5
617
        vaddl.u8        t0,  \r0, d30
618
        vext.8          d18, \r2, \r3, #2
619
        vmla.i16        t0,  q1,  d6[1]
620
        vext.8          d19, \r2, \r3, #3
621
        vaddl.u8        q9,  d18, d19
622
        vext.8          d20, \r2, \r3, #1
623
        vmls.i16        t0,  q2,  d6[0]
624
        vext.8          d21, \r2, \r3, #4
625
        vaddl.u8        q10, d20, d21
626
        vext.8          d31, \r2, \r3, #5
627
        vaddl.u8        t1,  \r2, d31
628
        vmla.i16        t1,  q9,  d6[1]
629
        vmls.i16        t1,  q10, d6[0]
630
.if \narrow
631
        vqrshrun.s16    \d0, t0,  #5
632
        vqrshrun.s16    \d1, t1,  #5
633
.endif
634
        .unreq  t0
635
        .unreq  t1
636
        .endm
637

    
638
        .macro  lowpass_8_1 r0, r1, d0, narrow=1
639
.if \narrow
640
        t0 .req q0
641
.else
642
        t0 .req \d0
643
.endif
644
        vext.8          d2,  \r0, \r1, #2
645
        vext.8          d3,  \r0, \r1, #3
646
        vaddl.u8        q1,  d2,  d3
647
        vext.8          d4,  \r0, \r1, #1
648
        vext.8          d5,  \r0, \r1, #4
649
        vaddl.u8        q2,  d4,  d5
650
        vext.8          d30, \r0, \r1, #5
651
        vaddl.u8        t0,  \r0, d30
652
        vmla.i16        t0,  q1,  d6[1]
653
        vmls.i16        t0,  q2,  d6[0]
654
.if \narrow
655
        vqrshrun.s16    \d0, t0,  #5
656
.endif
657
        .unreq  t0
658
        .endm
659

    
660
        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
661
        vext.16         q1,  \r0, \r1, #2
662
        vext.16         q0,  \r0, \r1, #3
663
        vaddl.s16       q9,  d2,  d0
664
        vext.16         q2,  \r0, \r1, #1
665
        vaddl.s16       q1,  d3,  d1
666
        vext.16         q3,  \r0, \r1, #4
667
        vaddl.s16       q10, d4,  d6
668
        vext.16         \r1, \r0, \r1, #5
669
        vaddl.s16       q2,  d5,  d7
670
        vaddl.s16       q0,  \h0, \h1
671
        vaddl.s16       q8,  \l0, \l1
672

    
673
        vshl.i32        q3,  q9,  #4
674
        vshl.i32        q9,  q9,  #2
675
        vshl.i32        q15, q10, #2
676
        vadd.i32        q9,  q9,  q3
677
        vadd.i32        q10, q10, q15
678

    
679
        vshl.i32        q3,  q1,  #4
680
        vshl.i32        q1,  q1,  #2
681
        vshl.i32        q15, q2,  #2
682
        vadd.i32        q1,  q1,  q3
683
        vadd.i32        q2,  q2,  q15
684

    
685
        vadd.i32        q9,  q9,  q8
686
        vsub.i32        q9,  q9,  q10
687

    
688
        vadd.i32        q1,  q1,  q0
689
        vsub.i32        q1,  q1,  q2
690

    
691
        vrshrn.s32      d18, q9,  #10
692
        vrshrn.s32      d19, q1,  #10
693

    
694
        vqmovun.s16     \d,  q9
695
        .endm
696

    
697
function put_h264_qpel16_h_lowpass_neon_packed
698
        mov             r4,  lr
699
        mov             ip,  #16
700
        mov             r3,  #8
701
        bl              put_h264_qpel8_h_lowpass_neon
702
        sub             r1,  r1,  r2, lsl #4
703
        add             r1,  r1,  #8
704
        mov             ip,  #16
705
        mov             lr,  r4
706
        b               put_h264_qpel8_h_lowpass_neon
707
        .endfunc
708

    
709
function put_h264_qpel16_h_lowpass_neon
710
        push            {lr}
711
        mov             ip,  #16
712
        bl              put_h264_qpel8_h_lowpass_neon
713
        sub             r0,  r0,  r3, lsl #4
714
        sub             r1,  r1,  r2, lsl #4
715
        add             r0,  r0,  #8
716
        add             r1,  r1,  #8
717
        mov             ip,  #16
718
        pop             {lr}
719
        .endfunc
720

    
721
function put_h264_qpel8_h_lowpass_neon
722
1:      vld1.64         {d0, d1},  [r1], r2
723
        vld1.64         {d16,d17}, [r1], r2
724
        subs            ip,  ip,  #2
725
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
726
        vst1.64         {d0},     [r0,:64], r3
727
        vst1.64         {d16},    [r0,:64], r3
728
        bne             1b
729
        bx              lr
730
        .endfunc
731

    
732
function put_h264_qpel16_h_lowpass_l2_neon
733
        push            {lr}
734
        mov             ip,  #16
735
        bl              put_h264_qpel8_h_lowpass_l2_neon
736
        sub             r0,  r0,  r2, lsl #4
737
        sub             r1,  r1,  r2, lsl #4
738
        sub             r3,  r3,  r2, lsl #4
739
        add             r0,  r0,  #8
740
        add             r1,  r1,  #8
741
        add             r3,  r3,  #8
742
        mov             ip,  #16
743
        pop             {lr}
744
        .endfunc
745

    
746
function put_h264_qpel8_h_lowpass_l2_neon
747
1:      vld1.64         {d0, d1},  [r1], r2
748
        vld1.64         {d16,d17}, [r1], r2
749
        vld1.64         {d28},     [r3], r2
750
        vld1.64         {d29},     [r3], r2
751
        subs            ip,  ip,  #2
752
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
753
        vrhadd.u8       q0,  q0,  q14
754
        vst1.64         {d0},      [r0,:64], r2
755
        vst1.64         {d1},      [r0,:64], r2
756
        bne             1b
757
        bx              lr
758
        .endfunc
759

    
760
function put_h264_qpel16_v_lowpass_neon_packed
761
        mov             r4,  lr
762
        mov             r2,  #8
763
        bl              put_h264_qpel8_v_lowpass_neon
764
        sub             r1,  r1,  r3, lsl #2
765
        bl              put_h264_qpel8_v_lowpass_neon
766
        sub             r1,  r1,  r3, lsl #4
767
        sub             r1,  r1,  r3, lsl #2
768
        add             r1,  r1,  #8
769
        bl              put_h264_qpel8_v_lowpass_neon
770
        sub             r1,  r1,  r3, lsl #2
771
        mov             lr,  r4
772
        b               put_h264_qpel8_v_lowpass_neon
773
        .endfunc
774

    
775
function put_h264_qpel16_v_lowpass_neon
776
        mov             r4,  lr
777
        bl              put_h264_qpel8_v_lowpass_neon
778
        sub             r1,  r1,  r3, lsl #2
779
        bl              put_h264_qpel8_v_lowpass_neon
780
        sub             r0,  r0,  r2, lsl #4
781
        add             r0,  r0,  #8
782
        sub             r1,  r1,  r3, lsl #4
783
        sub             r1,  r1,  r3, lsl #2
784
        add             r1,  r1,  #8
785
        bl              put_h264_qpel8_v_lowpass_neon
786
        sub             r1,  r1,  r3, lsl #2
787
        mov             lr,  r4
788
        .endfunc
789

    
790
function put_h264_qpel8_v_lowpass_neon
791
        vld1.64         {d8},  [r1], r3
792
        vld1.64         {d10}, [r1], r3
793
        vld1.64         {d12}, [r1], r3
794
        vld1.64         {d14}, [r1], r3
795
        vld1.64         {d22}, [r1], r3
796
        vld1.64         {d24}, [r1], r3
797
        vld1.64         {d26}, [r1], r3
798
        vld1.64         {d28}, [r1], r3
799
        vld1.64         {d9},  [r1], r3
800
        vld1.64         {d11}, [r1], r3
801
        vld1.64         {d13}, [r1], r3
802
        vld1.64         {d15}, [r1], r3
803
        vld1.64         {d23}, [r1]
804

    
805
        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
806
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
807
        lowpass_8       d12, d13, d14, d15, d12, d14
808
        lowpass_8       d22, d23, d24, d25, d22, d24
809
        lowpass_8       d26, d27, d28, d29, d26, d28
810
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
811

    
812
        vst1.64         {d8},  [r0,:64], r2
813
        vst1.64         {d10}, [r0,:64], r2
814
        vst1.64         {d12}, [r0,:64], r2
815
        vst1.64         {d14}, [r0,:64], r2
816
        vst1.64         {d22}, [r0,:64], r2
817
        vst1.64         {d24}, [r0,:64], r2
818
        vst1.64         {d26}, [r0,:64], r2
819
        vst1.64         {d28}, [r0,:64], r2
820

    
821
        bx              lr
822
        .endfunc
823

    
824
function put_h264_qpel16_v_lowpass_l2_neon
825
        mov             r4,  lr
826
        bl              put_h264_qpel8_v_lowpass_l2_neon
827
        sub             r1,  r1,  r3, lsl #2
828
        bl              put_h264_qpel8_v_lowpass_l2_neon
829
        sub             r0,  r0,  r3, lsl #4
830
        sub             ip,  ip,  r2, lsl #4
831
        add             r0,  r0,  #8
832
        add             ip,  ip,  #8
833
        sub             r1,  r1,  r3, lsl #4
834
        sub             r1,  r1,  r3, lsl #2
835
        add             r1,  r1,  #8
836
        bl              put_h264_qpel8_v_lowpass_l2_neon
837
        sub             r1,  r1,  r3, lsl #2
838
        mov             lr,  r4
839
        .endfunc
840

    
841
@ Vertical 6-tap qpel lowpass on an 8x8 block, rounding-averaged with a
@ second 8x8 source (the "l2" variant used for quarter/half-pel cases).
@ In:  r0 = dst (stride r3), r1 = src (stride r3; 13 rows are read),
@      ip = second source (stride r2)
@ Strategy: load 13 rows, transpose so columns become rows, run the
@ horizontal lowpass_8 macro (defined earlier in this file), transpose
@ back, then vrhadd with the second source and store.
function put_h264_qpel8_v_lowpass_l2_neon
        @ 13 source rows; rows 0-7 go to the low halves, rows 8-12 to the
        @ high halves of q4-q7/q11 so the 8x8 transpose pairs up correctly.
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9     @ filter transposed rows
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27  @ back to row order

        @ Load the 8 rows of the second source and average pairwise.
        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4            @ rounding average with filter output
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vrhadd.u8       q5,  q5,  q13
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
        .endfunc
887

    
888
@ Core of the 2-D (horizontal + vertical) 6-tap lowpass for an 8x8 block.
@ Pass 1: horizontal lowpass of 13 input rows, kept at 16-bit precision
@ (narrow=0) in the scratch buffer at r4.  Pass 2: transpose the 16-bit
@ intermediates and filter again (lowpass_8.16), leaving the final 8x8
@ bytes in d8-d15 for the caller to store/average.
@ In:  r1 = src (stride r3), r4 = 16-byte-aligned scratch (16*12 bytes)
@ Out: result rows in d12-d15, d8-d11 (see final transpose); lr return.
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip                      @ set up filter constants
        mov             ip,  #12                @ 12 of the 13 rows in the loop
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!   @ 16-bit intermediates to scratch
        bne             1b

        vld1.64         {d0, d1},  [r1]         @ 13th row
        lowpass_8_1     d0,  d1,  q12, narrow=0

        @ Re-read the scratch buffer bottom-up (ip = -16 byte stride).
        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        @ 16-bit transpose of the first 8 rows (swap4 + transpose16_4x4).
        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        @ Same for the remaining rows.
        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        @ Spill half of the transposed data back to scratch; the register
        @ file cannot hold all 13 16-bit rows at once.
        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        @ Second filter pass on 16-bit data, narrowing into d8-d11.
        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        @ Remaining output rows: reload spilled pairs and filter each.
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11  @ back to row order

        bx              lr
        .endfunc
953

    
954
@ 8x8 2-D lowpass: run the register-returning core, then store the
@ eight result rows (d12-d15 then d8-d11, the row order produced by the
@ core's final transpose) to dst.
@ In:  r0 = dst (stride r2), r1 = src (stride r3), r4 = scratch buffer
@ r10 holds the return address across the nested bl.
function put_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
        vst1.64         {d12},     [r0,:64], r2
        vst1.64         {d13},     [r0,:64], r2
        vst1.64         {d14},     [r0,:64], r2
        vst1.64         {d15},     [r0,:64], r2
        vst1.64         {d8},      [r0,:64], r2
        vst1.64         {d9},      [r0,:64], r2
        vst1.64         {d10},     [r0,:64], r2
        vst1.64         {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
        .endfunc
969

    
970
@ 8x8 2-D lowpass, rounding-averaged with a second 8x8 source read
@ contiguously from r2 (16-byte aligned, advanced past 64 bytes).
@ In:  r0 = dst (stride r3), r1 = src (stride r3 via the core),
@      r2 = second source, r4 = scratch for the core
@ r10 holds the return address across the nested bl.
function put_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        @ Average the filter output (d12-d15, d8-d11 row order) with the
        @ second source, interleaving loads with vrhadd for latency.
        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4

        vst1.64         {d0},      [r0,:64], r3
        vrhadd.u8       q3,  q3,  q5
        vst1.64         {d1},      [r0,:64], r3
        vst1.64         {d2},      [r0,:64], r3
        vst1.64         {d3},      [r0,:64], r3
        vst1.64         {d4},      [r0,:64], r3
        vst1.64         {d5},      [r0,:64], r3
        vst1.64         {d6},      [r0,:64], r3
        vst1.64         {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
        .endfunc
995

    
996
@ 16x16 2-D lowpass built from four 8x8 calls (quadrant order: two left,
@ then pointers moved right by 8, two right).  The fourth call is a tail
@ branch after restoring lr, so it returns straight to our caller.
@ In:  r0 = dst (stride r2), r1 = src (stride r3), r4 = scratch
function put_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr                 @ save return address
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2    @ rewind src 4 rows (filter overlap)
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4    @ src back up 20 rows,
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            @ ...and over to the right column
        sub             r0,  r0,  r2, lsl #4    @ dst back to the top,
        add             r0,  r0,  #8            @ ...right column
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_neon  @ tail call: 4th quadrant + return
        .endfunc
1011

    
1012
@ 16x16 2-D lowpass with second-source average, four 8x8 "l2" calls.
@ The second source sits 256 bytes below the scratch pointer r4 (set up
@ by the mc21/mc12 callers); the fourth call is a tail branch.
@ In:  r0 = dst (stride r3), r1 = src (stride r3), r4 = scratch
function put_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr                 @ save return address
        sub             r2,  r4,  #256          @ r2 = second-source buffer
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2    @ rewind src 4 rows (filter overlap)
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4    @ src back up 20 rows,
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            @ ...right column
        sub             r0,  r0,  r3, lsl #4    @ dst back to top, right column
        add             r0,  r0,  #8
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_l2_neon  @ tail call: 4th quadrant + return
        .endfunc
1028

    
1029
@ ----------------------------------------------------------------------
@ Exported 8x8 quarter-pel entry points, C signature
@   void ff_put_h264_qpel8_mcXY_neon(uint8_t *dst, uint8_t *src, int stride)
@ Per FFmpeg qpel naming, X/Y are the horizontal/vertical quarter-pel
@ offsets; each entry sets up pointers/strides and dispatches to the
@ appropriate lowpass (h, v, hv) routine, averaging two taps for the
@ odd quarter positions.  Functions needing d8-d15 save them around the
@ worker calls; the stack is realigned to 16 where NEON spills are used.
@ ----------------------------------------------------------------------

@ (1,0): horizontal lowpass averaged with the unfiltered pixel at x=0.
function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 @ second source = src itself
        sub             r1,  r1,  #2            @ filter needs 2 left pixels
        mov             ip,  #8                 @ 8 rows
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

@ (2,0): pure horizontal half-pel lowpass.
function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2                 @ src stride = dst stride
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

@ (3,0): horizontal lowpass averaged with the pixel at x=1.
function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1            @ second source = src+1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc

@ (0,1): vertical lowpass averaged with the row at y=0.
function ff_put_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip,  r1                 @ second source = src
put_h264_qpel8_mc01:                            @ shared with mc03 (ip preset)
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1    @ filter needs 2 rows above
        vpush           {d8-d15}                @ workers clobber callee-saved VFP regs
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

@ (1,1): h-lowpass into a stack buffer, then v-lowpass averaged with it.
function ff_put_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r11, lr}       @ r0/r1 reloaded via ldrd below
put_h264_qpel8_mc11:                            @ shared with mc31/mc13/mc33
        lowpass_const   r3
        mov             r11, sp                 @ frame pointer for realigned stack
        bic             sp,  sp,  #15
        sub             sp,  sp,  #64           @ 8x8 temp buffer
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8                 @ temp buffer stride
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  [r11]              @ restore original dst/src
        mov             r3,  r2
        add             ip,  sp,  #64           @ temp buffer (above the vpush area)
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8                 @ temp stride for the average source
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8            @ drop saved r0/r1 too
        pop             {r11, pc}
        .endfunc

@ (2,1): h-lowpass into a stack buffer, then hv-lowpass averaged with it.
function ff_put_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:                            @ shared with mc23
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)  @ 8x8 result + 16-bit scratch
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0                 @ r4 = scratch for the hv core
        ldrd            r0,  [r11]              @ restore original dst/src
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64           @ second source = h-lowpass buffer
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4, r10, r11, pc}
        .endfunc

@ (3,1): as mc11 but averaging with the column at x=1 (saved r1 is src+1).
function ff_put_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc

@ (0,2): pure vertical half-pel lowpass.
function ff_put_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc

@ (1,2): v-lowpass into a stack buffer, then hv-lowpass averaged with it.
function ff_put_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:                            @ shared with mc32
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)  @ 8x8 result + 16-bit scratch
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0                 @ r4 = scratch for the hv core
        ldrd            r0,  [r11]              @ restore original dst/src
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64           @ second source = v-lowpass buffer
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4, r10, r11, pc}
        .endfunc

@ (2,2): pure 2-D half-pel lowpass.
function ff_put_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      @ 16-bit scratch only
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
        .endfunc

@ (3,2): as mc12 but the h-filter source is shifted right by one.
function ff_put_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               put_h264_qpel8_mc12
        .endfunc

@ (0,3): as mc01 but averaging with the row at y=1 (ip = src + stride).
function ff_put_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             ip,  r1,  r2
        b               put_h264_qpel8_mc01
        .endfunc

@ (1,3): as mc11 with the source shifted down one row.
function ff_put_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc11
        .endfunc

@ (2,3): as mc21 with the source shifted down one row.
function ff_put_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc21
        .endfunc

@ (3,3): as mc11 with the saved src at (+1,0) and filter src at (0,+1).
function ff_put_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc
1204

    
1205
@ ----------------------------------------------------------------------
@ Exported 16x16 quarter-pel entry points, C signature
@   void ff_put_h264_qpel16_mcXY_neon(uint8_t *dst, uint8_t *src, int stride)
@ Same dispatch scheme as the 8x8 set above, using the 16-wide lowpass
@ workers (which internally iterate the 8x8 routines).
@ ----------------------------------------------------------------------

@ (1,0): horizontal lowpass averaged with the unfiltered pixel at x=0.
function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1                 @ second source = src itself
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

@ (2,0): pure horizontal half-pel lowpass.
function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               put_h264_qpel16_h_lowpass_neon
        .endfunc

@ (3,0): horizontal lowpass averaged with the pixel at x=1.
function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc

@ (0,1): vertical lowpass averaged with the row at y=0.
function ff_put_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}                @ r4 used by the 16-wide v worker
        mov             ip,  r1
put_h264_qpel16_mc01:                           @ shared with mc03 (ip preset)
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

@ (1,1): h-lowpass into a stack buffer, then v-lowpass averaged with it.
function ff_put_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, r11, lr}
put_h264_qpel16_mc11:                           @ shared with mc31/mc13/mc33
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #256          @ 16x16 temp buffer
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #16                @ temp buffer stride
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon
        ldrd            r0,  [r11]              @ restore original dst/src
        mov             r3,  r2
        add             ip,  sp,  #64           @ temp buffer (above the vpush area)
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #16
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r4, r11, pc}
        .endfunc

@ (2,1): packed h-lowpass to scratch, then hv-lowpass averaged with it.
function ff_put_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:                           @ shared with mc23
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)  @ 16x16 result + 16-bit scratch
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0                 @ r4 = scratch for the hv core
        ldrd            r0,  [r11]              @ restore original dst/src
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

@ (3,1): as mc11 but averaging with the column at x=1 (saved r1 is src+1).
function ff_put_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
        .endfunc

@ (0,2): pure vertical half-pel lowpass.
function ff_put_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

@ (1,2): packed v-lowpass to scratch, then hv-lowpass averaged with it.
function ff_put_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:                           @ shared with mc32
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)  @ 16x16 result + 16-bit scratch
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0                 @ r4 = scratch for the hv core
        ldrd            r0,  [r11]              @ restore original dst/src
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

@ (2,2): pure 2-D half-pel lowpass.
function ff_put_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)      @ 16-bit scratch only
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
        .endfunc

@ (3,2): as mc12 but the h-filter source is shifted right by one.
function ff_put_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               put_h264_qpel16_mc12
        .endfunc

@ (0,3): as mc01 but averaging with the row at y=1 (ip = src + stride).
function ff_put_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip,  r1,  r2
        b               put_h264_qpel16_mc01
        .endfunc

@ (1,3): as mc11 with the source shifted down one row.
function ff_put_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc11
        .endfunc

@ (2,3): as mc21 with the source shifted down one row.
function ff_put_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc21
        .endfunc

@ (3,3): as mc11 with the saved src at (+1,0) and filter src at (0,+1).
function ff_put_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
        .endfunc
1373

    
1374
@ Biweighted prediction
1375

    
1376
        @ Bi-weighted prediction inner loop, 16 pixels wide, two rows per
        @ iteration.  \macd/\macs are vmlal.u8 or vmlsl.u8, chosen by
        @ biweight_func according to the signs of the two weights.
        @ In: r0/r1 = the two sources (stride r2), r6 = dst, ip = row count,
        @     r4/r5 = |weights|, q8 = offset term, q9 = negated shift.
        .macro  biweight_16 macs, macd
        vdup.8          d0,  r4                 @ d0 = weight for source 0
        vdup.8          d1,  r5                 @ d1 = weight for source 1
        vmov            q2,  q8                 @ init accumulators with offset
        vmov            q3,  q8
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20           @ acc +/-= w0 * src0
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22           @ acc +/-= w1 * src1
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8                 @ second row's accumulators
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9            @ >> (log2_denom+1) via negative vshl
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2                 @ saturate to u8
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8                 @ re-init for next iteration
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm
1415

    
1416
        @ Bi-weighted prediction inner loop, 8 pixels wide, two rows per
        @ iteration.  Same register contract as biweight_16.
        .macro  biweight_8 macs, macd
        vdup.8          d0,  r4                 @ d0 = weight for source 0
        vdup.8          d1,  r5                 @ d1 = weight for source 1
        vmov            q1,  q8                 @ init accumulators with offset
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4            @ acc +/-= w0 * src0
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5            @ acc +/-= w1 * src1
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2       @ second row
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9            @ shift down, then saturate to u8
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8                 @ re-init for next iteration
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm
1445

    
1446
        @ Bi-weighted prediction inner loop, 4 pixels wide, four rows per
        @ iteration (two rows packed per d register).  A height of 2 takes
        @ the blt 2f path, which finishes the first row pair and returns.
        .macro  biweight_4 macs, macd
        vdup.8          d0,  r4                 @ d0 = weight for source 0
        vdup.8          d1,  r5                 @ d1 = weight for source 1
        vmov            q1,  q8                 @ init accumulators with offset
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r2    @ two rows of src0 into d4
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2    @ two rows of src1 into d5
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f                      @ height was only 2
        vld1.32         {d6[0]},[r0,:32], r2    @ rows 3-4
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9            @ shift down, saturate to u8
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8                 @ re-init for next iteration
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9            @ tail: finish the 2-row case
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
        .endm
1487

    
1488
        @ Instantiates biweight_h264_pixels_\w\()_neon:
        @   (dst=r0, src=r1, stride=r2, log2_denom=r3,
        @    stack: weightd, weights, offset), ip = height (set by entry stub).
        @ Computes dst = sat(((dst*wd + src*ws) + off') >> (log2_denom+1))
        @ by dispatching to one of four mlal/mlsl combinations based on the
        @ signs of the two weights, so the inner loops can use unsigned
        @ multiply-accumulate with |weights|.
        .macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4,  sp,  #16
        ldm             r4,  {r4-r6}            @ r4=weightd, r5=weights, r6=offset
        lsr             lr,  r4,  #31           @ sign bit of weightd
        add             r6,  r6,  #1            @ offset rounding term:
        eors            lr,  lr,  r5,  lsr #30  @ combine with sign info of weights
        orr             r6,  r6,  #1            @ ((offset+1)|1) << log2_denom
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3
        vmvn            q9,  q9                 @ q9 = -(log2_denom+1) for vshl
        vdup.16         q8,  r6                 @ q8 = rounding/offset init value
        mov             r6,  r0                 @ r6 = dst pointer for stores
        beq             10f                     @ dispatch on sign combination
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f
10:     biweight_\w     vmlal.u8, vmlal.u8      @ both weights >= 0
20:     rsb             r4,  r4,  #0            @ weightd < 0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0            @ both weights < 0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0            @ weights < 0
        biweight_\w     vmlsl.u8, vmlal.u8
        .endfunc
        .endm
1518

    
1519
        @ Exported per-size entry stub: sets ip = height and branches to the
        @ width-specific worker.  b=0 omits the branch for the entry placed
        @ immediately before its worker, which is reached by fall-through.
        .macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm

        @ Instantiate all width/height combinations; each b=0 entry falls
        @ through into the biweight_func emitted right after it.
        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8

        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4
1541

    
1542
@ Weighted prediction
1543

    
1544
        @ Weighted prediction inner loop, 16 pixels wide, two rows per
        @ iteration.  \add is vadd/vsub (or vhadd/vhsub for the halved
        @ path), chosen by weight_func from the offset sign and log2_denom.
        @ In: r0 = block (stride r1), r4 = dst, r3 = |weight|, ip = height,
        @     q8 = offset term, q9 = shift for vrshl.
        .macro  weight_16 add
        vdup.8          d0,  r3                 @ d0 = weight
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20           @ widen: weight * pixels
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1 @ second row
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2            @ apply offset
        vrshl.s16       q2,  q2,  q9            @ rounding shift down
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2                 @ saturate to u8
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
        .endm
1572

    
1573
        @ Weighted prediction inner loop, 8 pixels wide, two rows per
        @ iteration.  Same register contract as weight_16.
        .macro  weight_8 add
        vdup.8          d0,  r3                 @ d0 = weight
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4            @ widen: weight * pixels
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1       @ second row
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1            @ apply offset
        pld             [r0]
        vrshl.s16       q1,  q1,  q9            @ rounding shift down
        vqmovun.s16     d2,  q1                 @ saturate to u8
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
        .endm
1593

    
1594
        @ Weighted prediction inner loop, 4 pixels wide, four rows per
        @ iteration (two rows per d register).  A height of 2 takes the
        @ blt 2f path, which finishes the first row pair and returns.
        .macro  weight_4 add
        vdup.8          d0,  r3                 @ d0 = weight
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1    @ two rows into d4
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4            @ widen: weight * pixels
        pld             [r0]
        blt             2f                      @ height was only 2
        vld1.32         {d6[0]},[r0,:32], r1    @ rows 3-4
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1            @ apply offset
        vrshl.s16       q1,  q1,  q9            @ rounding shift down
        vqmovun.s16     d2,  q1                 @ saturate to u8
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1,  q8,  q1            @ tail: finish the 2-row case
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
        .endm
1629

    
1630
        @ Instantiates weight_h264_pixels_\w\()_neon:
        @   (block=r0, stride=r1, log2_denom=r2, weight=r3, stack: offset),
        @   ip = height (set by the entry stub).
        @ Four paths: log2_denom > 1 uses the halving vhadd/vhsub variants
        @ with shift (1 - log2_denom); otherwise plain vadd/vsub with shift
        @ -log2_denom.  Within each, the offset sign picks add vs sub so the
        @ loops work with |values|.
        .macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]           @ r4 = offset
        cmp             r2,  #1
        lsl             r4,  r4,  r2            @ offset << log2_denom
        vdup.16         q8,  r4                 @ q8 = offset term
        mov             r4,  r0                 @ r4 = dst pointer for stores
        ble             20f                     @ small log2_denom path
        rsb             lr,  r2,  #1            @ shift = 1 - log2_denom
        vdup.16         q9,  lr
        cmp             r3,  #0                 @ weight sign
        blt             10f
        weight_\w       vhadd.s16               @ weight >= 0
10:     rsb             r3,  r3,  #0            @ weight < 0: negate, then subtract
        weight_\w       vhsub.s16
20:     rsb             lr,  r2,  #0            @ shift = -log2_denom
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vsub.s16
        .endfunc
        .endm
1655

    
1656
        @ Exported per-size entry stub: sets ip = height and branches to the
        @ width-specific worker.  b=0 omits the branch for the entry placed
        @ immediately before its worker, which is reached by fall-through.
        .macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm

        @ Instantiate all width/height combinations; each b=0 entry falls
        @ through into the weight_func emitted right after it.
        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4