/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

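@ In-register transposes built from vtrn at successively finer element
@ sizes (32-, 16-, then 8-bit).  A rough C model of the 8x8 byte
@ transpose performed by transpose_8x8 (illustrative only, not part of
@ the build):
/*
 * static void transpose_8x8_c(uint8_t m[8][8])
 * {
 *     for (int i = 0; i < 8; i++)
 *         for (int j = 0; j < i; j++) {
 *             uint8_t t = m[i][j];   // swap across the diagonal
 *             m[i][j]   = m[j][i];
 *             m[j][i]   = t;
 *         }
 * }
 */
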
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
        .endm

        .macro transpose_4x4 r0 r1 r2 r3
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        .endm

        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
        .endm

        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
        .endm

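@ H.264 chroma MC is bilinear interpolation with eighth-pel weights.
@ With x, y in 0..7 and A, B, C, D the four source pixels surrounding
@ each destination pixel, the spec defines:
/*
 * dst = ((8 - x) * (8 - y) * A + x * (8 - y) * B +
 *        (8 - x) *  y      * C + x *  y      * D + 32) >> 6;
 */
@ The muls/rsb/sub/add sequences below compute exactly these four
@ weights in GPRs before broadcasting them with vdup.8; the beq 2f
@ paths handle x*y == 0, where the filter degenerates to one dimension
@ or to a plain copy.
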
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
        .macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
        .endm

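@ The 4-pixel-wide version uses the same weights but packs two rows
@ into each d-register with vtrn.32 (and interleaves the weights to
@ match), so one 8-wide multiply produces two output rows at once.
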
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
        .macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4},     [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6},     [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6},     [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
endfunc
        .endm

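@ 2-pixel-wide blocks.  When x == y == 0 (the orrs/beq below) no
@ filtering is needed: the put variant is a plain halfword copy and
@ the avg variant a vrhadd average with the destination.
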
        .macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        push            {r4-r6, lr}
        ldr             r4,  [sp, #16]
        ldr             lr,  [sp, #20]
        pld             [r1]
        pld             [r1, r2]
        orrs            r5,  r4,  lr
        beq             2f

        mul             r5,  r4,  lr
        rsb             r6,  r5,  lr,  lsl #3
        rsb             r12, r5,  r4,  lsl #3
        sub             r4,  r5,  r4,  lsl #3
        sub             r4,  r4,  lr,  lsl #3
        add             r4,  r4,  #64
        vdup.8          d0,  r4
        vdup.8          d2,  r12
        vdup.8          d1,  r6
        vdup.8          d3,  r5
        vtrn.16         q0,  q1
1:
        vld1.32         {d4[0]},  [r1], r2
        vld1.32         {d4[1]},  [r1], r2
        vrev64.32       d5,  d4
        vld1.32         {d5[1]},  [r1]
        vext.8          q3,  q2,  q2,  #1
        vtrn.16         q2,  q3
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
.ifc \type,avg
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
.endif
        vtrn.32         d16, d17
        vadd.i16        d16, d16, d17
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vrhadd.u8       d16, d16, d18
.endif
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
        subs            r3,  r3,  #2
        bgt             1b
        pop             {r4-r6, pc}
2:
.ifc \type,put
        ldrh            r5,  [r1], r2
        strh            r5,  [r0], r2
        ldrh            r6,  [r1], r2
        strh            r6,  [r0], r2
.else
        vld1.16         {d16[0]}, [r1], r2
        vld1.16         {d16[1]}, [r1], r2
        vld1.16         {d18[0]}, [r0,:16], r2
        vld1.16         {d18[1]}, [r0,:16]
        sub             r0,  r0,  r2
        vrhadd.u8       d16, d16, d18
        vst1.16         {d16[0]}, [r0,:16], r2
        vst1.16         {d16[1]}, [r0,:16], r2
.endif
        subs            r3,  r3,  #2
        bgt             2b
        pop             {r4-r6, pc}
endfunc
.endm

        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg
        h264_chroma_mc2 put
        h264_chroma_mc2 avg

        /* H.264 loop filter */

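@ h264_loop_filter_start loads the four tc0 bytes (pointed to by the
@ stacked argument) into d24[0] and returns early when alpha (r2) or
@ beta (r3) is zero, or when all four tc0 values are negative, i.e.
@ nothing to filter.  The filters themselves apply the spec's
@ per-pixel gating:
/*
 * if (abs(p0 - q0) < alpha && abs(p1 - p0) < beta && abs(q1 - q0) < beta)
 *     ... filter the edge ...
 */
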
        .macro h264_loop_filter_start
        ldr             ip,  [sp]
        tst             r2,  r2
        ldr             ip,  [ip]
        tstne           r3,  r3
        vmov.32         d24[0], ip
        and             ip,  ip,  ip, lsl #16
        bxeq            lr
        ands            ip,  ip,  ip, lsl #8
        bxlt            lr
        .endm

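@ d8-d15 are callee-saved; these macros spill them to a 16-byte-aligned
@ scratch area (the pad size stays in ip for align_pop_regs) so the
@ :128 alignment hints on the vst1/vld1 below hold.
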
        .macro align_push_regs
        and             ip,  sp,  #15
        add             ip,  ip,  #32
        sub             sp,  sp,  ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm

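@ Luma filter for one 16-pixel edge.  On entry q10/q9/q8 hold p2/p1/p0
@ and q0/q1/q2 hold q0/q1/q2; on exit q4/q8/q0/q5 hold the filtered
@ p1/p0/q0/q1.  The core delta computation matches the reference C:
/*
 * delta = av_clip((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc);
 * p0 = av_clip_uint8(p0 + delta);
 * q0 = av_clip_uint8(q0 - delta);
 */
@ where tc is tc0 incremented once for each of the |p2-p0| < beta and
@ |q2-q0| < beta conditions that also enable the p1/q1 updates.
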
        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9
        vbsl            q5,  q14, q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
        .endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1
        vld1.64         {d2, d3},  [r0,:128], r1
        vld1.64         {d4, d5},  [r0,:128], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1},  [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]

        align_pop_regs
        bx              lr
endfunc

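@ The _h_ variant filters a vertical edge: sixteen rows of 8 pixels
@ straddling the edge are loaded and transposed so the same filter can
@ run on whole registers, then the four modified middle columns are
@ transposed back and stored 4 bytes at a time.
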
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs

        h264_loop_filter_luma

        transpose_4x4   q4, q8, q0, q5

        sub             r0,  r0,  r1, lsl #4
        add             r0,  r0,  #2
        vst1.32         {d8[0]},  [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d10[0]}, [r0], r1
        vst1.32         {d8[1]},  [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d10[1]}, [r0], r1
        vst1.32         {d9[0]},  [r0], r1
        vst1.32         {d17[0]}, [r0], r1
        vst1.32         {d1[0]},  [r0], r1
        vst1.32         {d11[0]}, [r0], r1
        vst1.32         {d9[1]},  [r0], r1
        vst1.32         {d17[1]}, [r0], r1
        vst1.32         {d1[1]},  [r0], r1
        vst1.32         {d11[1]}, [r0], r1

        align_pop_regs
        bx              lr
endfunc

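@ Chroma filter: same delta computation and gating as luma, but only
@ p0 and q0 are modified, with the delta clipped against the tc0 bytes
@ (vsli.16 spreads each tc0 value across its pair of chroma pixels).
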
        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vrshrn.i16      d4,  q2,  #3
        vclt.u8         d28, d28, d22   @ < beta
        vclt.u8         d30, d30, d22   @ < beta
        vmin.s8         d4,  d4,  d24
        vneg.s8         d25, d24
        vand            d26, d26, d28
        vmax.s8         d4,  d4,  d25
        vand            d26, d26, d30
        vmovl.u8        q11, d0
        vand            d4,  d4,  d26
        vmovl.u8        q14, d16
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
endfunc

        /* H.264 qpel MC */

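@ Luma qpel uses the spec's six-tap half-pel filter with coefficients
@ (1, -5, 20, 20, -5, 1):
/*
 * dst = av_clip_uint8((a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5);
 */
@ lowpass_const builds the constant pair 5/20 in a GPR with movw/movt
@ and drops it into d6, so vmla/vmls can reference 20 as d6[1] and 5
@ as d6[0].
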
        .macro  lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
        .endm

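@ lowpass_8 filters two 8-pixel rows horizontally; \r0-\r1 and
@ \r2-\r3 each hold one row's 13 source pixels (offsets 0-12 of the
@ register pair are used).  With narrow=1 the result is rounded,
@ shifted down by 5 and saturated to bytes in \d0/\d1; with narrow=0
@ the raw 16-bit sums are kept for a following vertical pass.
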
        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq  t0
        .unreq  t1
        .endm

        .macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq  t0
        .endm

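@ Second (vertical) filter pass over the 16-bit intermediates produced
@ by narrow=0 lowpass_8.  The *20 and *5 taps are done with shift-adds
@ (t*20 = (t<<4) + (t<<2), t*5 = t + (t<<2)) and the final rounding
@ shift is 10 because both passes scale by 32:
/*
 * sum = 20 * (c + d) - 5 * (b + e) + (a + f);   // 32-bit lanes
 * dst = av_clip_uint8((sum + 512) >> 10);
 */
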
        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
        .endm

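@ The 16-wide helpers below split the block into two 8-wide columns
@ and run the 8-wide worker over each.  The _packed variants store
@ their intermediate with an 8-byte stride so it can be reused as the
@ second input of a later averaging (_l2) pass, which is how the
@ quarter-pel positions combine two filter results.
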
function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             ip,  #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4
        add             r1,  r1,  #8
        mov             ip,  #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
endfunc

        .macro h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip,  #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             ip,  #16
        pop             {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
.ifc \type,avg
        vld1.8          {d2},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3
.endif
        vst1.64         {d0},     [r0,:64], r3
        vst1.64         {d16},    [r0,:64], r3
        bne             1b
        bx              lr
endfunc
        .endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

        .macro h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip,  #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             ip,  #16
        pop             {lr}
endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28},     [r3], r2
        vld1.64         {d29},     [r3], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14
.ifc \type,avg
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       d1,  d1,  d3
        sub             r0,  r0,  r2
.endif
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
endfunc
        .endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

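@ Vertical lowpass: thirteen rows are loaded and transposed so the
@ horizontal lowpass_8 macro can be reused, then the result is
@ transposed back before storing.
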
function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
endfunc

        .macro h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

.ifc \type,avg
        vld1.8          {d9},  [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2,  lsl #3
.endif

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
endfunc
        .endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

        .macro h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             ip,  ip,  r2, lsl #4
        add             r0,  r0,  #8
        add             ip,  ip,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2
        vrhadd.u8       q5,  q5,  q13

.ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3,  lsl #3
.endif

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
endfunc
        .endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

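@ 2D (hv) positions: put_h264_qpel8_hv_lowpass_neon_top runs the
@ horizontal six-tap over 13 input rows without narrowing, parks the
@ 16-bit intermediates in the scratch buffer at r4, then transposes
@ them in 4x4 blocks and applies lowpass_8.16 vertically.  The final
@ 8x8 result is left transposed back in d8-d15.
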
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        vld1.64         {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
endfunc

        .macro h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
        vld1.8          {d0},      [r0,:64], r2
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1},      [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4},      [r0,:64], r2
        vrhadd.u8       d8,  d8,  d4
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       d9,  d9,  d5
        vld1.8          {d6},      [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0,  r0,  r2,  lsl #3
.endif
        vst1.64         {d12},     [r0,:64], r2
        vst1.64         {d13},     [r0,:64], r2
        vst1.64         {d14},     [r0,:64], r2
        vst1.64         {d15},     [r0,:64], r2
        vst1.64         {d8},      [r0,:64], r2
        vst1.64         {d9},      [r0,:64], r2
        vst1.64         {d10},     [r0,:64], r2
        vst1.64         {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
endfunc
        .endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

        .macro h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
.ifc \type,avg
        vld1.8          {d16},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17},     [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d18},     [r0,:64], r3
        vrhadd.u8       d2,  d2,  d18
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       d3,  d3,  d19
        vld1.8          {d20},     [r0,:64], r3
        vrhadd.u8       d4,  d4,  d20
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       d5,  d5,  d21
        vld1.8          {d22},     [r0,:64], r3
        vrhadd.u8       d6,  d6,  d22
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       d7,  d7,  d23
        sub             r0,  r0,  r3,  lsl #3
.endif
        vst1.64         {d0},      [r0,:64], r3
        vst1.64         {d1},      [r0,:64], r3
        vst1.64         {d2},      [r0,:64], r3
        vst1.64         {d3},      [r0,:64], r3
        vst1.64         {d4},      [r0,:64], r3
        vst1.64         {d5},      [r0,:64], r3
        vst1.64         {d6},      [r0,:64], r3
        vst1.64         {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
endfunc
        .endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

        .macro h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_neon
endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
endfunc
        .endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

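@ Public qpel entry points.  The mcXY suffix encodes the quarter-pel
@ offset: X is the horizontal and Y the vertical quarter-sample
@ position (mc20 = horizontal half-pel, mc02 = vertical half-pel,
@ mc22 = both).  Positions with an odd offset average a half-pel
@ filter result with a neighbouring full- or half-pel reference,
@ which is what the _l2 helpers implement.
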
        .macro h264_qpel8 type
function ff_\type\()_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3
        mov             r3,  r1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3
        sub             r1,  r1,  #2
        mov             r3,  r2
        mov             ip,  #8
        b               \type\()_h264_qpel8_h_lowpass_neon
endfunc

function ff_\type\()_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        mov             ip,  #8
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
endfunc

function ff_\type\()_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip,  r1
\type\()_h264_qpel8_mc01:
        lowpass_const   r3
        mov             r3,  r2
        sub             r1,  r1,  r2, lsl #1
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r11, lr}
\type\()_h264_qpel8_mc11:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #64
        mov             r0,  sp
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  [r11]
        mov             r3,  r2
        add             ip,  sp,  #64
        sub             r1,  r1,  r2, lsl #1
        mov             r2,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8
        pop             {r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  #2
        mov             r3,  #8
        mov             r0,  sp
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             r2,  r4,  #64
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
endfunc

function ff_\type\()_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
\type\()_h264_qpel8_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        mov             r2,  #8
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        sub             r2,  r4,  #64
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
endfunc

function ff_\type\()_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc12
endfunc

function ff_\type\()_h264_qpel8_mc03_neon, export=1
        push            {lr}
        add             ip,  r1,  r2
        b               \type\()_h264_qpel8_mc01
endfunc

function ff_\type\()_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc11
endfunc

function ff_\type\()_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel8_mc21
endfunc

function ff_\type\()_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel8_mc11
endfunc
        .endm

        h264_qpel8 put
        h264_qpel8 avg

1404
        .macro h264_qpel16 type
1405
function ff_\type\()_h264_qpel16_mc10_neon, export=1
1406
        lowpass_const   r3
1407
        mov             r3,  r1
1408
        sub             r1,  r1,  #2
1409
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
1410
endfunc
1411

    
1412
function ff_\type\()_h264_qpel16_mc20_neon, export=1
1413
        lowpass_const   r3
1414
        sub             r1,  r1,  #2
1415
        mov             r3,  r2
1416
        b               \type\()_h264_qpel16_h_lowpass_neon
1417
endfunc
1418

    
1419
function ff_\type\()_h264_qpel16_mc30_neon, export=1
1420
        lowpass_const   r3
1421
        add             r3,  r1,  #1
1422
        sub             r1,  r1,  #2
1423
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
1424
endfunc
1425

    
1426
function ff_\type\()_h264_qpel16_mc01_neon, export=1
1427
        push            {r4, lr}
1428
        mov             ip,  r1
1429
\type\()_h264_qpel16_mc01:
1430
        lowpass_const   r3
1431
        mov             r3,  r2
1432
        sub             r1,  r1,  r2, lsl #1
1433
        vpush           {d8-d15}
1434
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1435
        vpop            {d8-d15}
1436
        pop             {r4, pc}
1437
endfunc
1438

    
1439
function ff_\type\()_h264_qpel16_mc11_neon, export=1
1440
        push            {r0, r1, r4, r11, lr}
1441
\type\()_h264_qpel16_mc11:
1442
        lowpass_const   r3
1443
        mov             r11, sp
1444
        bic             sp,  sp,  #15
1445
        sub             sp,  sp,  #256
1446
        mov             r0,  sp
1447
        sub             r1,  r1,  #2
1448
        mov             r3,  #16
1449
        vpush           {d8-d15}
1450
        bl              put_h264_qpel16_h_lowpass_neon
1451
        ldrd            r0,  [r11]
1452
        mov             r3,  r2
1453
        add             ip,  sp,  #64
1454
        sub             r1,  r1,  r2, lsl #1
1455
        mov             r2,  #16
1456
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1457
        vpop            {d8-d15}
1458
        add             sp,  r11, #8
1459
        pop             {r4, r11, pc}
1460
endfunc
1461

    
1462
function ff_\type\()_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc21:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc

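@ mc12 mirrors mc21 with the passes swapped: the vertical lowpass is
@ rendered packed to the stack first, then combined with the 2D lowpass
@ result by *_hv_lowpass_l2 (editorial note).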
function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
endfunc

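@ mc22 is the centre half-pel position: only the 2D lowpass is needed,
@ with a 16*12-byte stack scratch area passed in r4 (editorial note).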
function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc12
endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip,  r1,  r2
        b               \type\()_h264_qpel16_mc01
endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc11
endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc21
endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
endfunc
        .endm

        h264_qpel16 put
        h264_qpel16 avg

@ Biweighted prediction

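/* biweight_h264_pixels_WxH(uint8_t *dst, uint8_t *src, int stride,
 *                          int log2_denom, int weightd, int weights,
 *                          int offset)
 *
 * Editorial sketch of the computation, assumed to match the C reference:
 *   o      = ((offset + 1) | 1) << log2_denom
 *   dst[i] = clip_uint8((dst[i]*weightd + src[i]*weights + o)
 *                       >> (log2_denom + 1))
 * The shift is a vshl by NOT(log2_denom), i.e. an arithmetic right
 * shift by log2_denom + 1; the rounding term is pre-loaded into q8.
 */
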
        .macro  biweight_16 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_8 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_4 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
        .endm

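@ The weights are signed, but vmlal.u8/vmlsl.u8 take unsigned operands,
@ so biweight_func dispatches to one of four loop instantiations
@ (labels 10-40) that negate the weights as needed and pick the matching
@ accumulate/subtract pairing (editorial note).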
        .macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4,  sp,  #16
        ldm             r4,  {r4-r6}
        lsr             lr,  r4,  #31
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30
        orr             r6,  r6,  #1
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3
        vmvn            q9,  q9
        vdup.16         q8,  r6
        mov             r6,  r0
        beq             10f
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
endfunc
        .endm

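@ biweight_entry emits a per-size wrapper that loads the block height
@ into ip; the b=0 instance is placed immediately before the shared
@ biweight_h264_pixels_\w\()_neon body so it falls through without a
@ branch (editorial note).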
        .macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
endfunc
        .endm

        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8

        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4

@ Weighted prediction

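/* weight_h264_pixels_WxH(uint8_t *block, int stride, int log2_denom,
 *                        int weight, int offset)
 *
 * Editorial sketch of the computation, assumed to match the C reference:
 *   block[i] = clip_uint8(((block[i]*weight + (1 << (log2_denom - 1)))
 *                          >> log2_denom) + offset)
 * Here the offset is pre-scaled (offset << log2_denom, held in q8) and
 * folded into the weighted sum so a single rounding shift suffices.
 */
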
        .macro  weight_16 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2
        vrshl.s16       q2,  q2,  q9
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_8 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_4 add
        vdup.8          d0,  r3
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
        .endm

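@ weight_func picks one of four loop bodies: for log2_denom > 1 the
@ pre-scaled offset is folded in with a halving add/subtract
@ (vhadd/vhsub) followed by a rounding shift by log2_denom - 1; for
@ log2_denom <= 1 a plain add/subtract with a rounding shift by
@ log2_denom is used.  Negative weights are negated and take the
@ subtracting variants (editorial note).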
        .macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]
        cmp             r2,  #1
        lsl             r4,  r4,  r2
        vdup.16         q8,  r4
        mov             r4,  r0
        ble             20f
        rsb             lr,  r2,  #1
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vhsub.s16
20:     rsb             lr,  r2,  #0
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vsub.s16
endfunc
        .endm

        .macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
endfunc
        .endm

        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4