ffmpeg / libavcodec / arm / h264dsp_neon.S @ 04e7f6d2

/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

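@ In-register transposes built from vtrn at successively finer element
@ sizes: transpose_8x8 transposes an 8x8 block of bytes held in eight
@ d-registers (vtrn.32, then .16, then .8); transpose_4x4 and
@ transpose16_4x4 are the smaller and 16-bit variants used by the loop
@ filter and qpel code below.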
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r4
        vtrn.32         \r1, \r5
        vtrn.32         \r2, \r6
        vtrn.32         \r3, \r7
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.16         \r4, \r6
        vtrn.16         \r5, \r7
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        vtrn.8          \r4, \r5
        vtrn.8          \r6, \r7
        .endm

        .macro transpose_4x4 r0 r1 r2 r3
        vtrn.16         \r0, \r2
        vtrn.16         \r1, \r3
        vtrn.8          \r0, \r1
        vtrn.8          \r2, \r3
        .endm

        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
        vswp            \r0, \r4
        vswp            \r1, \r5
        vswp            \r2, \r6
        vswp            \r3, \r7
        .endm

        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
        vtrn.32         \r0, \r2
        vtrn.32         \r1, \r3
        vtrn.32         \r4, \r6
        vtrn.32         \r5, \r7
        vtrn.16         \r0, \r1
        vtrn.16         \r2, \r3
        vtrn.16         \r4, \r5
        vtrn.16         \r6, \r7
        .endm

/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
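@ 8x8 chroma MC: bilinear interpolation between the four neighbouring
@ pixels with weights A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy:
@   dst = (A*p00 + B*p01 + C*p10 + D*p11 + 32) >> 6
@ The muls/rsb/sub sequence computes D, C, B and A into r7, r6, ip and
@ r4.  When x*y == 0 only a 1-D filter is needed: label 3 handles the
@ vertical-only case (x == 0) and label 5 the horizontal-only case
@ (y == 0, which also covers the full-pel copy).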
        .macro  h264_chroma_mc8 type
function ff_\type\()_h264_chroma_mc8_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4, d5}, [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6, d7}, [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r4
        vmlal.u8        q8,  d6,  d2
        vext.8          d5,  d4,  d5,  #1
        vmlal.u8        q8,  d7,  d3
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vmlal.u8        q9,  d7,  d1
        vmlal.u8        q9,  d4,  d2
        vmlal.u8        q9,  d5,  d3
        vrshrn.u16      d16, q8,  #6
        vld1.64         {d6, d7}, [r5], r4
        pld             [r1]
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip

        beq             4f

        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.64         {d4}, [r1], r4
        vld1.64         {d6}, [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d1
        vld1.64         {d4}, [r1], r4
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d1
        vld1.64         {d6}, [r5], r4
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4, d5}, [r1], r2
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1

5:      pld             [r1]
        subs            r3,  r3,  #2
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d5,  d1
        vld1.64         {d4, d5}, [r1], r2
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d7,  d1
        pld             [r1]
        vext.8          d5,  d4,  d5,  #1
        vrshrn.u16      d16, q8,  #6
        vrshrn.u16      d17, q9,  #6
.ifc \type,avg
        vld1.64         {d20}, [lr,:64], r2
        vld1.64         {d21}, [lr,:64], r2
        vrhadd.u8       q8,  q8,  q10
.endif
        vld1.64         {d6, d7}, [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vst1.64         {d16}, [r0,:64], r2
        vst1.64         {d17}, [r0,:64], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
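@ 4-wide variant of the above: vtrn.32 packs two 4-pixel quantities
@ into a single d-register so each vmull/vmlal covers both at once,
@ and vadd.i16 folds the two partial sums before the rounding narrow
@ to 8 bits.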
        .macro  h264_chroma_mc4 type
function ff_\type\()_h264_chroma_mc4_neon, export=1
        push            {r4-r7, lr}
        ldrd            r4,  [sp, #20]
.ifc \type,avg
        mov             lr,  r0
.endif
        pld             [r1]
        pld             [r1, r2]

        muls            r7,  r4,  r5
        rsb             r6,  r7,  r5,  lsl #3
        rsb             ip,  r7,  r4,  lsl #3
        sub             r4,  r7,  r4,  lsl #3
        sub             r4,  r4,  r5,  lsl #3
        add             r4,  r4,  #64

        beq             2f

        add             r5,  r1,  r2

        vdup.8          d0,  r4
        lsl             r4,  r2,  #1
        vdup.8          d1,  ip
        vld1.64         {d4},     [r1], r4
        vdup.8          d2,  r6
        vld1.64         {d6},     [r5], r4
        vdup.8          d3,  r7

        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

        vtrn.32         d0,  d1
        vtrn.32         d2,  d3

1:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vmlal.u8        q8,  d6,  d2
        vld1.64         {d4},     [r1], r4
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vmull.u8        q9,  d6,  d0
        vmlal.u8        q9,  d4,  d2
        vld1.64         {d6},     [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
        subs            r3,  r3,  #2
        pld             [r1]
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             1b

        pop             {r4-r7, pc}

2:      tst             r6,  r6
        add             ip,  ip,  r6
        vdup.8          d0,  r4
        vdup.8          d1,  ip
        vtrn.32         d0,  d1

        beq             4f

        vext.32         d1,  d0,  d1,  #1
        add             r5,  r1,  r2
        lsl             r4,  r2,  #1
        vld1.32         {d4[0]},  [r1], r4
        vld1.32         {d4[1]},  [r5], r4

3:      pld             [r5]
        vmull.u8        q8,  d4,  d0
        vld1.32         {d4[0]},  [r1], r4
        vmull.u8        q9,  d4,  d1
        vld1.32         {d4[1]},  [r5], r4
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        subs            r3,  r3,  #2
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             3b

        pop             {r4-r7, pc}

4:      vld1.64         {d4},     [r1], r2
        vld1.64         {d6},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d4,  d5
        vtrn.32         d6,  d7

5:      vmull.u8        q8,  d4,  d0
        vmull.u8        q9,  d6,  d0
        subs            r3,  r3,  #2
        vld1.64         {d4},     [r1], r2
        vext.8          d5,  d4,  d5,  #1
        vtrn.32         d4,  d5
        vadd.i16        d16, d16, d17
        vadd.i16        d17, d18, d19
        pld             [r1]
        vrshrn.u16      d16, q8,  #6
.ifc \type,avg
        vld1.32         {d20[0]}, [lr,:32], r2
        vld1.32         {d20[1]}, [lr,:32], r2
        vrhadd.u8       d16, d16, d20
.endif
        vld1.64         {d6},     [r1], r2
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         d6,  d7
        pld             [r1]
        vst1.32         {d16[0]}, [r0,:32], r2
        vst1.32         {d16[1]}, [r0,:32], r2
        bgt             5b

        pop             {r4-r7, pc}
        .endfunc
        .endm

        .text
        .align

        h264_chroma_mc8 put
        h264_chroma_mc8 avg
        h264_chroma_mc4 put
        h264_chroma_mc4 avg

        /* H.264 loop filter */

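@ Common prologue for the loop filters: loads the four tc0 bytes
@ through the pointer passed on the stack into d24[0] and returns
@ early when there is nothing to do, i.e. when alpha (r2) or beta (r3)
@ is zero (bxeq), or when all four tc0 values are negative (the
@ and/ands chain leaves bit 31 set only in that case, hence bxlt).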
        .macro h264_loop_filter_start
        ldr             ip,  [sp]
        tst             r2,  r2
        ldr             ip,  [ip]
        tstne           r3,  r3
        vmov.32         d24[0], ip
        and             ip,  ip,  ip, lsl #16
        bxeq            lr
        ands            ip,  ip,  ip, lsl #8
        bxlt            lr
        .endm

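@ Spill/restore the callee-saved registers d8-d15 on a 16-byte-aligned
@ stack area; ip carries the alignment adjustment so align_pop_regs
@ can restore sp exactly.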
        .macro align_push_regs
        and             ip,  sp,  #15
        add             ip,  ip,  #32
        sub             sp,  sp,  ip
        vst1.64         {d12-d15}, [sp,:128]
        sub             sp,  sp,  #32
        vst1.64         {d8-d11},  [sp,:128]
        .endm

        .macro align_pop_regs
        vld1.64         {d8-d11},  [sp,:128]!
        vld1.64         {d12-d15}, [sp,:128], ip
        .endm

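@ Filters one luma edge, 16 pixels at a time: p2..p0 in q10/q9/q8 and
@ q0..q2 in q0/q1/q2.  Builds the filter mask from |p0-q0| < alpha,
@ |p1-p0| < beta and |q1-q0| < beta, computes
@   delta = clip((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc)
@ for p0/q0, and conditionally filters p1/q1 where |p2-p0| resp.
@ |q2-q0| < beta, bumping tc by one for each of those conditions
@ (the vsub.i8 of the all-ones condition masks from tc0).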
        .macro h264_loop_filter_luma
        vdup.8          q11, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
        vmovl.u16       q12, d24
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
        vsli.16         q12, q12, #8
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
        vsli.32         q12, q12, #16
        vclt.u8         q6,  q6,  q11   @ < alpha
        vdup.8          q11, r3         @ beta
        vclt.s8         q7,  q12, #0
        vclt.u8         q14, q14, q11   @ < beta
        vclt.u8         q15, q15, q11   @ < beta
        vbic            q6,  q6,  q7
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
        vand            q6,  q6,  q14
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
        vclt.u8         q4,  q4,  q11   @ < beta
        vand            q6,  q6,  q15
        vclt.u8         q5,  q5,  q11   @ < beta
        vand            q4,  q4,  q6
        vand            q5,  q5,  q6
        vand            q12, q12, q6
        vrhadd.u8       q14, q8,  q0
        vsub.i8         q6,  q12, q4
        vqadd.u8        q7,  q9,  q12
        vhadd.u8        q10, q10, q14
        vsub.i8         q6,  q6,  q5
        vhadd.u8        q14, q2,  q14
        vmin.u8         q7,  q7,  q10
        vqsub.u8        q11, q9,  q12
        vqadd.u8        q2,  q1,  q12
        vmax.u8         q7,  q7,  q11
        vqsub.u8        q11, q1,  q12
        vmin.u8         q14, q2,  q14
        vmovl.u8        q2,  d0
        vmax.u8         q14, q14, q11
        vmovl.u8        q10, d1
        vsubw.u8        q2,  q2,  d16
        vsubw.u8        q10, q10, d17
        vshl.i16        q2,  q2,  #2
        vshl.i16        q10, q10, #2
        vaddw.u8        q2,  q2,  d18
        vaddw.u8        q10, q10, d19
        vsubw.u8        q2,  q2,  d2
        vsubw.u8        q10, q10, d3
        vrshrn.i16      d4,  q2,  #3
        vrshrn.i16      d5,  q10, #3
        vbsl            q4,  q7,  q9
        vbsl            q5,  q14, q1
        vneg.s8         q7,  q6
        vmovl.u8        q14, d16
        vmin.s8         q2,  q2,  q6
        vmovl.u8        q6,  d17
        vmax.s8         q2,  q2,  q7
        vmovl.u8        q11, d0
        vmovl.u8        q12, d1
        vaddw.s8        q14, q14, d4
        vaddw.s8        q6,  q6,  d5
        vsubw.s8        q11, q11, d4
        vsubw.s8        q12, q12, d5
        vqmovun.s16     d16, q14
        vqmovun.s16     d17, q6
        vqmovun.s16     d0,  q11
        vqmovun.s16     d1,  q12
        .endm

function ff_h264_v_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        vld1.64         {d0, d1},  [r0,:128], r1
        vld1.64         {d2, d3},  [r0,:128], r1
        vld1.64         {d4, d5},  [r0,:128], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d20,d21}, [r0,:128], r1
        vld1.64         {d18,d19}, [r0,:128], r1
        vld1.64         {d16,d17}, [r0,:128], r1

        align_push_regs

        h264_loop_filter_luma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d8, d9},  [r0,:128], r1
        vst1.64         {d16,d17}, [r0,:128], r1
        vst1.64         {d0, d1},  [r0,:128], r1
        vst1.64         {d10,d11}, [r0,:128]

        align_pop_regs
        bx              lr
        .endfunc

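@ Same filter across a vertical edge: load 16 rows of 8 bytes
@ straddling the edge, transpose so the filter operates on columns,
@ run h264_loop_filter_luma, then transpose the four modified lines
@ back and scatter them with 32-bit lane stores.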
function ff_h264_h_loop_filter_luma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #4
        vld1.64         {d6},  [r0], r1
        vld1.64         {d20}, [r0], r1
        vld1.64         {d18}, [r0], r1
        vld1.64         {d16}, [r0], r1
        vld1.64         {d0},  [r0], r1
        vld1.64         {d2},  [r0], r1
        vld1.64         {d4},  [r0], r1
        vld1.64         {d26}, [r0], r1
        vld1.64         {d7},  [r0], r1
        vld1.64         {d21}, [r0], r1
        vld1.64         {d19}, [r0], r1
        vld1.64         {d17}, [r0], r1
        vld1.64         {d1},  [r0], r1
        vld1.64         {d3},  [r0], r1
        vld1.64         {d5},  [r0], r1
        vld1.64         {d27}, [r0], r1

        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13

        align_push_regs

        h264_loop_filter_luma

        transpose_4x4   q4, q8, q0, q5

        sub             r0,  r0,  r1, lsl #4
        add             r0,  r0,  #2
        vst1.32         {d8[0]},  [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d10[0]}, [r0], r1
        vst1.32         {d8[1]},  [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d10[1]}, [r0], r1
        vst1.32         {d9[0]},  [r0], r1
        vst1.32         {d17[0]}, [r0], r1
        vst1.32         {d1[0]},  [r0], r1
        vst1.32         {d11[0]}, [r0], r1
        vst1.32         {d9[1]},  [r0], r1
        vst1.32         {d17[1]}, [r0], r1
        vst1.32         {d1[1]},  [r0], r1
        vst1.32         {d11[1]}, [r0], r1

        align_pop_regs
        bx              lr
        .endfunc

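@ Chroma edge filter: only p0 and q0 are modified.  Uses the same mask
@ construction and delta = (((q0-p0)<<2) + (p1-q1) + 4) >> 3 as the
@ luma filter, clamped to the per-pixel tc range held in d24.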
        .macro h264_loop_filter_chroma
        vdup.8          d22, r2         @ alpha
        vmovl.u8        q12, d24
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
        vmovl.u8        q2,  d0
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
        vsubw.u8        q2,  q2,  d16
        vsli.16         d24, d24, #8
        vshl.i16        q2,  q2,  #2
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
        vaddw.u8        q2,  q2,  d18
        vclt.u8         d26, d26, d22   @ < alpha
        vsubw.u8        q2,  q2,  d2
        vdup.8          d22, r3         @ beta
        vclt.s8         d25, d24, #0
        vrshrn.i16      d4,  q2,  #3
        vclt.u8         d28, d28, d22   @ < beta
        vbic            d26, d26, d25
        vclt.u8         d30, d30, d22   @ < beta
        vand            d26, d26, d28
        vneg.s8         d25, d24
        vand            d26, d26, d30
        vmin.s8         d4,  d4,  d24
        vmovl.u8        q14, d16
        vand            d4,  d4,  d26
        vmax.s8         d4,  d4,  d25
        vmovl.u8        q11, d0
        vaddw.s8        q14, q14, d4
        vsubw.s8        q11, q11, d4
        vqmovun.s16     d16, q14
        vqmovun.s16     d0,  q11
        .endm

function ff_h264_v_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  r1, lsl #1
        vld1.64         {d18}, [r0,:64], r1
        vld1.64         {d16}, [r0,:64], r1
        vld1.64         {d0},  [r0,:64], r1
        vld1.64         {d2},  [r0,:64]

        h264_loop_filter_chroma

        sub             r0,  r0,  r1, lsl #1
        vst1.64         {d16}, [r0,:64], r1
        vst1.64         {d0},  [r0,:64], r1

        bx              lr
        .endfunc

function ff_h264_h_loop_filter_chroma_neon, export=1
        h264_loop_filter_start

        sub             r0,  r0,  #2
        vld1.32         {d18[0]}, [r0], r1
        vld1.32         {d16[0]}, [r0], r1
        vld1.32         {d0[0]},  [r0], r1
        vld1.32         {d2[0]},  [r0], r1
        vld1.32         {d18[1]}, [r0], r1
        vld1.32         {d16[1]}, [r0], r1
        vld1.32         {d0[1]},  [r0], r1
        vld1.32         {d2[1]},  [r0], r1

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        h264_loop_filter_chroma

        vtrn.16         d18, d0
        vtrn.16         d16, d2
        vtrn.8          d18, d16
        vtrn.8          d0,  d2

        sub             r0,  r0,  r1, lsl #3
        vst1.32         {d18[0]}, [r0], r1
        vst1.32         {d16[0]}, [r0], r1
        vst1.32         {d0[0]},  [r0], r1
        vst1.32         {d2[0]},  [r0], r1
        vst1.32         {d18[1]}, [r0], r1
        vst1.32         {d16[1]}, [r0], r1
        vst1.32         {d0[1]},  [r0], r1
        vst1.32         {d2[1]},  [r0], r1

        bx              lr
        .endfunc

        /* H.264 qpel MC */

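@ Quarter-pel MC is built on the 6-tap half-pel filter
@ (1, -5, 20, 20, -5, 1).  lowpass_const loads the constants 5 and 20
@ into d6[0]/d6[1] (movw/movt build 0x00140005); lowpass_8 forms the
@ tap pairs with vext, accumulates 20*(b+c) - 5*(a+d) + (outer pair)
@ in 16 bits and, unless narrow=0, rounds and saturates with
@ vqrshrun #5.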
        .macro  lowpass_const r
        movw            \r,  #5
        movt            \r,  #20
        vmov.32         d6[0], \r
        .endm

        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
.if \narrow
        t0 .req q0
        t1 .req q8
.else
        t0 .req \d0
        t1 .req \d1
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vext.8          d18, \r2, \r3, #2
        vmla.i16        t0,  q1,  d6[1]
        vext.8          d19, \r2, \r3, #3
        vaddl.u8        q9,  d18, d19
        vext.8          d20, \r2, \r3, #1
        vmls.i16        t0,  q2,  d6[0]
        vext.8          d21, \r2, \r3, #4
        vaddl.u8        q10, d20, d21
        vext.8          d31, \r2, \r3, #5
        vaddl.u8        t1,  \r2, d31
        vmla.i16        t1,  q9,  d6[1]
        vmls.i16        t1,  q10, d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
        vqrshrun.s16    \d1, t1,  #5
.endif
        .unreq  t0
        .unreq  t1
        .endm

        .macro  lowpass_8_1 r0, r1, d0, narrow=1
.if \narrow
        t0 .req q0
.else
        t0 .req \d0
.endif
        vext.8          d2,  \r0, \r1, #2
        vext.8          d3,  \r0, \r1, #3
        vaddl.u8        q1,  d2,  d3
        vext.8          d4,  \r0, \r1, #1
        vext.8          d5,  \r0, \r1, #4
        vaddl.u8        q2,  d4,  d5
        vext.8          d30, \r0, \r1, #5
        vaddl.u8        t0,  \r0, d30
        vmla.i16        t0,  q1,  d6[1]
        vmls.i16        t0,  q2,  d6[0]
.if \narrow
        vqrshrun.s16    \d0, t0,  #5
.endif
        .unreq  t0
        .endm

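@ Second pass of the 2-D (hv) filter: the inputs are 16-bit results of
@ the horizontal pass, so the taps are applied in 32 bits, with 20*x
@ formed as (x<<4) + (x<<2) and 5*x as (x<<2) + x, before the rounding
@ narrow by #10 and saturation to u8.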
        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
        vext.16         q1,  \r0, \r1, #2
        vext.16         q0,  \r0, \r1, #3
        vaddl.s16       q9,  d2,  d0
        vext.16         q2,  \r0, \r1, #1
        vaddl.s16       q1,  d3,  d1
        vext.16         q3,  \r0, \r1, #4
        vaddl.s16       q10, d4,  d6
        vext.16         \r1, \r0, \r1, #5
        vaddl.s16       q2,  d5,  d7
        vaddl.s16       q0,  \h0, \h1
        vaddl.s16       q8,  \l0, \l1

        vshl.i32        q3,  q9,  #4
        vshl.i32        q9,  q9,  #2
        vshl.i32        q15, q10, #2
        vadd.i32        q9,  q9,  q3
        vadd.i32        q10, q10, q15

        vshl.i32        q3,  q1,  #4
        vshl.i32        q1,  q1,  #2
        vshl.i32        q15, q2,  #2
        vadd.i32        q1,  q1,  q3
        vadd.i32        q2,  q2,  q15

        vadd.i32        q9,  q9,  q8
        vsub.i32        q9,  q9,  q10

        vadd.i32        q1,  q1,  q0
        vsub.i32        q1,  q1,  q2

        vrshrn.s32      d18, q9,  #10
        vrshrn.s32      d19, q1,  #10

        vqmovun.s16     \d,  q9
        .endm

function put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  lr
        mov             ip,  #16
        mov             r3,  #8
        bl              put_h264_qpel8_h_lowpass_neon
        sub             r1,  r1,  r2, lsl #4
        add             r1,  r1,  #8
        mov             ip,  #16
        mov             lr,  r4
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc

        .macro h264_qpel_h_lowpass type
function \type\()_h264_qpel16_h_lowpass_neon
        push            {lr}
        mov             ip,  #16
        bl              \type\()_h264_qpel8_h_lowpass_neon
        sub             r0,  r0,  r3, lsl #4
        sub             r1,  r1,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        mov             ip,  #16
        pop             {lr}
        .endfunc

function \type\()_h264_qpel8_h_lowpass_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
.ifc \type,avg
        vld1.8          {d2},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       d16, d16, d3
        sub             r0,  r0,  r3
.endif
        vst1.64         {d0},     [r0,:64], r3
        vst1.64         {d16},    [r0,:64], r3
        bne             1b
        bx              lr
        .endfunc
        .endm

        h264_qpel_h_lowpass put
        h264_qpel_h_lowpass avg

        .macro h264_qpel_h_lowpass_l2 type
function \type\()_h264_qpel16_h_lowpass_l2_neon
        push            {lr}
        mov             ip,  #16
        bl              \type\()_h264_qpel8_h_lowpass_l2_neon
        sub             r0,  r0,  r2, lsl #4
        sub             r1,  r1,  r2, lsl #4
        sub             r3,  r3,  r2, lsl #4
        add             r0,  r0,  #8
        add             r1,  r1,  #8
        add             r3,  r3,  #8
        mov             ip,  #16
        pop             {lr}
        .endfunc

function \type\()_h264_qpel8_h_lowpass_l2_neon
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d16,d17}, [r1], r2
        vld1.64         {d28},     [r3], r2
        vld1.64         {d29},     [r3], r2
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
        vrhadd.u8       q0,  q0,  q14
.ifc \type,avg
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d0,  d0,  d2
        vld1.8          {d3},      [r0,:64]
        vrhadd.u8       d1,  d1,  d3
        sub             r0,  r0,  r2
.endif
        vst1.64         {d0},      [r0,:64], r2
        vst1.64         {d1},      [r0,:64], r2
        bne             1b
        bx              lr
        .endfunc
        .endm

        h264_qpel_h_lowpass_l2 put
        h264_qpel_h_lowpass_l2 avg

function put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  lr
        mov             r2,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              put_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        b               put_h264_qpel8_v_lowpass_neon
        .endfunc

        .macro h264_qpel_v_lowpass type
function \type\()_h264_qpel16_v_lowpass_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        .endfunc

function \type\()_h264_qpel8_v_lowpass_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
        lowpass_8       d12, d13, d14, d15, d12, d14
        lowpass_8       d22, d23, d24, d25, d22, d24
        lowpass_8       d26, d27, d28, d29, d26, d28
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28

.ifc \type,avg
        vld1.8          {d9},  [r0,:64], r2
        vrhadd.u8       d8,  d8,  d9
        vld1.8          {d11}, [r0,:64], r2
        vrhadd.u8       d10, d10, d11
        vld1.8          {d13}, [r0,:64], r2
        vrhadd.u8       d12, d12, d13
        vld1.8          {d15}, [r0,:64], r2
        vrhadd.u8       d14, d14, d15
        vld1.8          {d23}, [r0,:64], r2
        vrhadd.u8       d22, d22, d23
        vld1.8          {d25}, [r0,:64], r2
        vrhadd.u8       d24, d24, d25
        vld1.8          {d27}, [r0,:64], r2
        vrhadd.u8       d26, d26, d27
        vld1.8          {d29}, [r0,:64], r2
        vrhadd.u8       d28, d28, d29
        sub             r0,  r0,  r2,  lsl #3
.endif

        vst1.64         {d8},  [r0,:64], r2
        vst1.64         {d10}, [r0,:64], r2
        vst1.64         {d12}, [r0,:64], r2
        vst1.64         {d14}, [r0,:64], r2
        vst1.64         {d22}, [r0,:64], r2
        vst1.64         {d24}, [r0,:64], r2
        vst1.64         {d26}, [r0,:64], r2
        vst1.64         {d28}, [r0,:64], r2

        bx              lr
        .endfunc
        .endm

        h264_qpel_v_lowpass put
        h264_qpel_v_lowpass avg

        .macro h264_qpel_v_lowpass_l2 type
function \type\()_h264_qpel16_v_lowpass_l2_neon
        mov             r4,  lr
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r0,  r0,  r3, lsl #4
        sub             ip,  ip,  r2, lsl #4
        add             r0,  r0,  #8
        add             ip,  ip,  #8
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r4
        .endfunc

function \type\()_h264_qpel8_v_lowpass_l2_neon
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]

        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2
        vrhadd.u8       q5,  q5,  q13

.ifc \type,avg
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d2,  d2,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d3,  d3,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d4,  d4,  d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d5,  d5,  d17
        vld1.8          {d16}, [r0,:64], r3
        vrhadd.u8       d10, d10, d16
        vld1.8          {d17}, [r0,:64], r3
        vrhadd.u8       d11, d11, d17
        sub             r0,  r0,  r3,  lsl #3
.endif

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
        .endfunc
        .endm

        h264_qpel_v_lowpass_l2 put
        h264_qpel_v_lowpass_l2 avg

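@ First stage of the 8x8 hv filter: run the horizontal lowpass over 13
@ rows without narrowing, store the 16-bit intermediates to the
@ scratch buffer in r4, reload them transposed (swap4 +
@ transpose16_4x4) and feed them through lowpass_8.16, leaving the
@ final 8x8 result in d8-d15.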
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip
        mov             ip,  #12
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        vld1.64         {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
        .endfunc

        .macro h264_qpel8_hv_lowpass type
function \type\()_h264_qpel8_hv_lowpass_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
.ifc \type,avg
        vld1.8          {d0},      [r0,:64], r2
        vrhadd.u8       d12, d12, d0
        vld1.8          {d1},      [r0,:64], r2
        vrhadd.u8       d13, d13, d1
        vld1.8          {d2},      [r0,:64], r2
        vrhadd.u8       d14, d14, d2
        vld1.8          {d3},      [r0,:64], r2
        vrhadd.u8       d15, d15, d3
        vld1.8          {d4},      [r0,:64], r2
        vrhadd.u8       d8,  d8,  d4
        vld1.8          {d5},      [r0,:64], r2
        vrhadd.u8       d9,  d9,  d5
        vld1.8          {d6},      [r0,:64], r2
        vrhadd.u8       d10, d10, d6
        vld1.8          {d7},      [r0,:64], r2
        vrhadd.u8       d11, d11, d7
        sub             r0,  r0,  r2,  lsl #3
.endif
        vst1.64         {d12},     [r0,:64], r2
        vst1.64         {d13},     [r0,:64], r2
        vst1.64         {d14},     [r0,:64], r2
        vst1.64         {d15},     [r0,:64], r2
        vst1.64         {d8},      [r0,:64], r2
        vst1.64         {d9},      [r0,:64], r2
        vst1.64         {d10},     [r0,:64], r2
        vst1.64         {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
        .endfunc
        .endm

        h264_qpel8_hv_lowpass put
        h264_qpel8_hv_lowpass avg

        .macro h264_qpel8_hv_lowpass_l2 type
function \type\()_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4
        vrhadd.u8       q3,  q3,  q5
.ifc \type,avg
        vld1.8          {d16},     [r0,:64], r3
        vrhadd.u8       d0,  d0,  d16
        vld1.8          {d17},     [r0,:64], r3
        vrhadd.u8       d1,  d1,  d17
        vld1.8          {d18},     [r0,:64], r3
        vrhadd.u8       d2,  d2,  d18
        vld1.8          {d19},     [r0,:64], r3
        vrhadd.u8       d3,  d3,  d19
        vld1.8          {d20},     [r0,:64], r3
        vrhadd.u8       d4,  d4,  d20
        vld1.8          {d21},     [r0,:64], r3
        vrhadd.u8       d5,  d5,  d21
        vld1.8          {d22},     [r0,:64], r3
        vrhadd.u8       d6,  d6,  d22
        vld1.8          {d23},     [r0,:64], r3
        vrhadd.u8       d7,  d7,  d23
        sub             r0,  r0,  r3,  lsl #3
.endif
        vst1.64         {d0},      [r0,:64], r3
        vst1.64         {d1},      [r0,:64], r3
        vst1.64         {d2},      [r0,:64], r3
        vst1.64         {d3},      [r0,:64], r3
        vst1.64         {d4},      [r0,:64], r3
        vst1.64         {d5},      [r0,:64], r3
        vst1.64         {d6},      [r0,:64], r3
        vst1.64         {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
        .endfunc
        .endm

        h264_qpel8_hv_lowpass_l2 put
        h264_qpel8_hv_lowpass_l2 avg

        .macro h264_qpel16_hv type
function \type\()_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r2, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_neon
        .endfunc

function \type\()_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr
        sub             r2,  r4,  #256
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8
        sub             r0,  r0,  r3, lsl #4
        add             r0,  r0,  #8
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2
        mov             lr,  r9
        b               \type\()_h264_qpel8_hv_lowpass_l2_neon
        .endfunc
        .endm

        h264_qpel16_hv put
        h264_qpel16_hv avg

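@ Per-position entry points: mcXY interpolates at fractional offset
@ (X, Y) in quarter-pel units.  Half-pel positions branch straight to
@ a lowpass function; quarter-pel positions use the _l2 variants,
@ which average the filtered plane with a second prediction (the
@ source itself or another filtered plane) as the H.264 interpolation
@ rules require.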
        .macro h264_qpel8 type
1156
function ff_\type\()_h264_qpel8_mc10_neon, export=1
1157
        lowpass_const   r3
1158
        mov             r3,  r1
1159
        sub             r1,  r1,  #2
1160
        mov             ip,  #8
1161
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
1162
        .endfunc
1163

    
1164
function ff_\type\()_h264_qpel8_mc20_neon, export=1
1165
        lowpass_const   r3
1166
        sub             r1,  r1,  #2
1167
        mov             r3,  r2
1168
        mov             ip,  #8
1169
        b               \type\()_h264_qpel8_h_lowpass_neon
1170
        .endfunc
1171

    
1172
function ff_\type\()_h264_qpel8_mc30_neon, export=1
1173
        lowpass_const   r3
1174
        add             r3,  r1,  #1
1175
        sub             r1,  r1,  #2
1176
        mov             ip,  #8
1177
        b               \type\()_h264_qpel8_h_lowpass_l2_neon
1178
        .endfunc
1179

    
1180
function ff_\type\()_h264_qpel8_mc01_neon, export=1
1181
        push            {lr}
1182
        mov             ip,  r1
1183
\type\()_h264_qpel8_mc01:
1184
        lowpass_const   r3
1185
        mov             r3,  r2
1186
        sub             r1,  r1,  r2, lsl #1
1187
        vpush           {d8-d15}
1188
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
1189
        vpop            {d8-d15}
1190
        pop             {pc}
1191
        .endfunc
1192

    
1193
function ff_\type\()_h264_qpel8_mc11_neon, export=1
1194
        push            {r0, r1, r11, lr}
1195
\type\()_h264_qpel8_mc11:
1196
        lowpass_const   r3
1197
        mov             r11, sp
1198
        bic             sp,  sp,  #15
1199
        sub             sp,  sp,  #64
1200
        mov             r0,  sp
1201
        sub             r1,  r1,  #2
1202
        mov             r3,  #8
1203
        mov             ip,  #8
1204
        vpush           {d8-d15}
1205
        bl              put_h264_qpel8_h_lowpass_neon
1206
        ldrd            r0,  [r11]
1207
        mov             r3,  r2
1208
        add             ip,  sp,  #64
1209
        sub             r1,  r1,  r2, lsl #1
1210
        mov             r2,  #8
1211
        bl              \type\()_h264_qpel8_v_lowpass_l2_neon
1212
        vpop            {d8-d15}
1213
        add             sp,  r11, #8
1214
        pop             {r11, pc}
1215
        .endfunc
1216

    
1217
function ff_\type\()_h264_qpel8_mc21_neon, export=1
1218
        push            {r0, r1, r4, r10, r11, lr}
1219
\type\()_h264_qpel8_mc21:
1220
        lowpass_const   r3
1221
        mov             r11, sp
1222
        bic             sp,  sp,  #15
1223
        sub             sp,  sp,  #(8*8+16*12)
1224
        sub             r1,  r1,  #2
1225
        mov             r3,  #8
1226
        mov             r0,  sp
1227
        mov             ip,  #8
1228
        vpush           {d8-d15}
1229
        bl              put_h264_qpel8_h_lowpass_neon
1230
        mov             r4,  r0
1231
        ldrd            r0,  [r11]
1232
        sub             r1,  r1,  r2, lsl #1
1233
        sub             r1,  r1,  #2
1234
        mov             r3,  r2
1235
        sub             r2,  r4,  #64
1236
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
1237
        vpop            {d8-d15}
1238
        add             sp,  r11,  #8
1239
        pop             {r4, r10, r11, pc}
1240
        .endfunc
1241

    
1242
function ff_\type\()_h264_qpel8_mc31_neon, export=1
1243
        add             r1,  r1,  #1
1244
        push            {r0, r1, r11, lr}
1245
        sub             r1,  r1,  #1
1246
        b               \type\()_h264_qpel8_mc11
1247
        .endfunc
1248

    
1249
function ff_\type\()_h264_qpel8_mc02_neon, export=1
1250
        push            {lr}
1251
        lowpass_const   r3
1252
        sub             r1,  r1,  r2, lsl #1
1253
        mov             r3,  r2
1254
        vpush           {d8-d15}
1255
        bl              \type\()_h264_qpel8_v_lowpass_neon
1256
        vpop            {d8-d15}
1257
        pop             {pc}
1258
        .endfunc
1259

    
1260
function ff_\type\()_h264_qpel8_mc12_neon, export=1
1261
        push            {r0, r1, r4, r10, r11, lr}
1262
\type\()_h264_qpel8_mc12:
1263
        lowpass_const   r3
1264
        mov             r11, sp
1265
        bic             sp,  sp,  #15
1266
        sub             sp,  sp,  #(8*8+16*12)
1267
        sub             r1,  r1,  r2, lsl #1
1268
        mov             r3,  r2
1269
        mov             r2,  #8
1270
        mov             r0,  sp
1271
        vpush           {d8-d15}
1272
        bl              put_h264_qpel8_v_lowpass_neon
1273
        mov             r4,  r0
1274
        ldrd            r0,  [r11]
1275
        sub             r1,  r1,  r3, lsl #1
1276
        sub             r1,  r1,  #2
1277
        sub             r2,  r4,  #64
1278
        bl              \type\()_h264_qpel8_hv_lowpass_l2_neon
1279
        vpop            {d8-d15}
1280
        add             sp,  r11,  #8
1281
        pop             {r4, r10, r11, pc}
1282
        .endfunc
1283

    
1284
function ff_\type\()_h264_qpel8_mc22_neon, export=1
1285
        push            {r4, r10, r11, lr}
1286
        mov             r11, sp
1287
        bic             sp,  sp,  #15
1288
        sub             r1,  r1,  r2, lsl #1
1289
        sub             r1,  r1,  #2
1290
        mov             r3,  r2
1291
        sub             sp,  sp,  #(16*12)
1292
        mov             r4,  sp
1293
        vpush           {d8-d15}
1294
        bl              \type\()_h264_qpel8_hv_lowpass_neon
1295
        vpop            {d8-d15}
1296
        mov             sp,  r11
1297
        pop             {r4, r10, r11, pc}
1298
        .endfunc
1299

    
1300
function ff_\type\()_h264_qpel8_mc32_neon, export=1
1301
        push            {r0, r1, r4, r10, r11, lr}
1302
        add             r1,  r1,  #1
1303
        b               \type\()_h264_qpel8_mc12
1304
        .endfunc
1305

    
1306
function ff_\type\()_h264_qpel8_mc03_neon, export=1
1307
        push            {lr}
1308
        add             ip,  r1,  r2
1309
        b               \type\()_h264_qpel8_mc01
1310
        .endfunc
1311

    
1312
function ff_\type\()_h264_qpel8_mc13_neon, export=1
1313
        push            {r0, r1, r11, lr}
1314
        add             r1,  r1,  r2
1315
        b               \type\()_h264_qpel8_mc11
1316
        .endfunc
1317

    
1318
function ff_\type\()_h264_qpel8_mc23_neon, export=1
1319
        push            {r0, r1, r4, r10, r11, lr}
1320
        add             r1,  r1,  r2
1321
        b               \type\()_h264_qpel8_mc21
1322
        .endfunc
1323

    
1324
function ff_\type\()_h264_qpel8_mc33_neon, export=1
1325
        add             r1,  r1,  #1
1326
        push            {r0, r1, r11, lr}
1327
        add             r1,  r1,  r2
1328
        sub             r1,  r1,  #1
1329
        b               \type\()_h264_qpel8_mc11
1330
        .endfunc
1331
        .endm
1332

    
1333
        h264_qpel8 put
1334
        h264_qpel8 avg
1335

    
1336
        .macro h264_qpel16 type
1337
function ff_\type\()_h264_qpel16_mc10_neon, export=1
1338
        lowpass_const   r3
1339
        mov             r3,  r1
1340
        sub             r1,  r1,  #2
1341
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
1342
        .endfunc
1343

    
1344
function ff_\type\()_h264_qpel16_mc20_neon, export=1
1345
        lowpass_const   r3
1346
        sub             r1,  r1,  #2
1347
        mov             r3,  r2
1348
        b               \type\()_h264_qpel16_h_lowpass_neon
1349
        .endfunc
1350

    
1351
function ff_\type\()_h264_qpel16_mc30_neon, export=1
1352
        lowpass_const   r3
1353
        add             r3,  r1,  #1
1354
        sub             r1,  r1,  #2
1355
        b               \type\()_h264_qpel16_h_lowpass_l2_neon
1356
        .endfunc
1357

    
1358
function ff_\type\()_h264_qpel16_mc01_neon, export=1
1359
        push            {r4, lr}
1360
        mov             ip,  r1
1361
\type\()_h264_qpel16_mc01:
1362
        lowpass_const   r3
1363
        mov             r3,  r2
1364
        sub             r1,  r1,  r2, lsl #1
1365
        vpush           {d8-d15}
1366
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1367
        vpop            {d8-d15}
1368
        pop             {r4, pc}
1369
        .endfunc
1370

    
1371
function ff_\type\()_h264_qpel16_mc11_neon, export=1
1372
        push            {r0, r1, r4, r11, lr}
1373
\type\()_h264_qpel16_mc11:
1374
        lowpass_const   r3
1375
        mov             r11, sp
1376
        bic             sp,  sp,  #15
1377
        sub             sp,  sp,  #256
1378
        mov             r0,  sp
1379
        sub             r1,  r1,  #2
1380
        mov             r3,  #16
1381
        vpush           {d8-d15}
1382
        bl              put_h264_qpel16_h_lowpass_neon
1383
        ldrd            r0,  [r11]
1384
        mov             r3,  r2
1385
        add             ip,  sp,  #64
1386
        sub             r1,  r1,  r2, lsl #1
1387
        mov             r2,  #16
1388
        bl              \type\()_h264_qpel16_v_lowpass_l2_neon
1389
        vpop            {d8-d15}
1390
        add             sp,  r11, #8
1391
        pop             {r4, r11, pc}
1392
        .endfunc
1393

    
1394
function ff_\type\()_h264_qpel16_mc21_neon, export=1
1395
        push            {r0, r1, r4-r5, r9-r11, lr}
1396
\type\()_h264_qpel16_mc21:
1397
        lowpass_const   r3
1398
        mov             r11, sp
1399
        bic             sp,  sp,  #15
1400
        sub             sp,  sp,  #(16*16+16*12)
1401
        sub             r1,  r1,  #2
1402
        mov             r0,  sp
1403
        vpush           {d8-d15}
1404
        bl              put_h264_qpel16_h_lowpass_neon_packed
1405
        mov             r4,  r0
1406
        ldrd            r0,  [r11]
1407
        sub             r1,  r1,  r2, lsl #1
1408
        sub             r1,  r1,  #2
1409
        mov             r3,  r2
1410
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
1411
        vpop            {d8-d15}
1412
        add             sp,  r11,  #8
1413
        pop             {r4-r5, r9-r11, pc}
1414
        .endfunc
1415

    
1416
function ff_\type\()_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
        .endfunc

function ff_\type\()_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}
        lowpass_const   r3
        sub             r1,  r1,  r2, lsl #1
        mov             r3,  r2
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc

function ff_\type\()_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
\type\()_h264_qpel16_mc12:
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)
        sub             r1,  r1,  r2, lsl #1
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0
        ldrd            r0,  [r11]
        sub             r1,  r1,  r3, lsl #1
        sub             r1,  r1,  #2
        mov             r2,  r3
        bl              \type\()_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8
        pop             {r4-r5, r9-r11, pc}
        .endfunc

function ff_\type\()_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3
        mov             r11, sp
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1
        sub             r1,  r1,  #2
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              \type\()_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
        .endfunc

function ff_\type\()_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc12
        .endfunc

function ff_\type\()_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}
        add             ip,  r1,  r2
        b               \type\()_h264_qpel16_mc01
        .endfunc

function ff_\type\()_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc11
        .endfunc

function ff_\type\()_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
        add             r1,  r1,  r2
        b               \type\()_h264_qpel16_mc21
        .endfunc

function ff_\type\()_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               \type\()_h264_qpel16_mc11
        .endfunc
        .endm

        h264_qpel16 put
        h264_qpel16 avg

@ Biweighted prediction
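@ biweight_h264_pixels_WxH(uint8_t *dst, uint8_t *src, int stride,
@                          int log2_denom, int weightd, int weights,
@                          int offset)
@ (argument names as in the C templates; the last three arrive on the
@ stack).  Implements the explicit bipred formula:
@   dst[x] = clip_uint8((dst[x]*weightd + src[x]*weights +
@                        (((offset+1)|1) << log2_denom)) >> (log2_denom+1))
@ q8 holds the combined rounding/offset term and q9 the negated shift
@ amount consumed by vshl.s16.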
        .macro  biweight_16 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_8 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_4 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
        .endm

        .macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
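@ vmlal.u8/vmlsl.u8 take only unsigned bytes, so the weights are
@ applied as magnitudes: the sign bits of weightd (r4) and weights
@ (r5) select one of four accumulate/subtract combinations below, with
@ negative weights negated (rsb) before being duplicated.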
        push            {r4-r6, lr}
        add             r4,  sp,  #16
        ldm             r4,  {r4-r6}
        lsr             lr,  r4,  #31
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30
        orr             r6,  r6,  #1
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3
        vmvn            q9,  q9
        vdup.16         q8,  r6
        mov             r6,  r0
        beq             10f
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
        .endfunc
        .endm

        .macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm
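@ For each width, one entry is instantiated with b=0 and placed
@ directly before the matching biweight_func so that it falls straight
@ through into the shared loop instead of branching.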

        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8

        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4

@ Weighted prediction
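@ weight_h264_pixels_WxH(uint8_t *block, int stride, int log2_denom,
@                        int weight, int offset)
@ (argument names as in the C templates; offset arrives on the stack).
@ Weights the block in place:
@   block[x] = clip_uint8((block[x]*weight + (offset << log2_denom) +
@                          (log2_denom ? 1 << (log2_denom-1) : 0))
@                         >> log2_denom)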
        .macro  weight_16 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2
        vrshl.s16       q2,  q2,  q9
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_8 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b
        pop             {r4, pc}
        .endm

        .macro  weight_4 add
        vdup.8          d0,  r3
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vmov            q1,  q8
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b
        pop             {r4, pc}
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
        .endm

        .macro  weight_func w
function weight_h264_pixels_\w\()_neon
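@ log2_denom (r2) selects the loop flavour: for log2_denom > 1 the
@ offset term is folded in with a halving add/subtract (vhadd/vhsub),
@ which keeps the 16-bit intermediate from overflowing, followed by a
@ rounding shift by log2_denom-1; for log2_denom <= 1 a plain
@ vadd/vsub with a rounding shift by log2_denom suffices.  Negative
@ weights take the subtracting variant with |weight| in d0.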
        push            {r4, lr}
        ldr             r4,  [sp, #8]
        cmp             r2,  #1
        lsl             r4,  r4,  r2
        vdup.16         q8,  r4
        mov             r4,  r0
        ble             20f
        rsb             lr,  r2,  #1
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vhsub.s16
20:     rsb             lr,  r2,  #0
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vsub.s16
        .endfunc
        .endm

        .macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm

        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4