Statistics
| Branch: | Revision:

ffmpeg / libavcodec / arm / h264dsp_neon.S @ 0115b3ea

History | View | Annotate | Download (55.2 KB)

1 1cce897a Måns Rullgård
/*
2
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20
21
#include "asm.S"
22
23 5813e05d Måns Rullgård
        /*
         * transpose_8x8 r0..r7
         *
         * In-place byte transpose across eight NEON registers, done as
         * three rounds of VTRN: 32-bit elements between registers four
         * apart, 16-bit elements between registers two apart, and 8-bit
         * elements between neighbours.  With D-register operands this is
         * an 8x8 byte matrix transpose; this file also invokes it with
         * Q registers, where each 64-bit half is handled the same way.
         * All eight operands are overwritten.
         */
        .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
24
        vtrn.32         \r0, \r4
25
        vtrn.32         \r1, \r5
26
        vtrn.32         \r2, \r6
27
        vtrn.32         \r3, \r7
28
        vtrn.16         \r0, \r2
29
        vtrn.16         \r1, \r3
30
        vtrn.16         \r4, \r6
31
        vtrn.16         \r5, \r7
32
        vtrn.8          \r0, \r1
33
        vtrn.8          \r2, \r3
34
        vtrn.8          \r4, \r5
35
        vtrn.8          \r6, \r7
36
        .endm
37
38 2da4e5e3 Måns Rullgård
        /*
         * transpose_4x4 r0..r3
         *
         * In-place transpose of 4x4 byte tiles: the same 32-bit lane of
         * the four registers forms one tile (16-bit VTRN between registers
         * two apart, then 8-bit VTRN between neighbours).
         * All four operands are overwritten.
         */
        .macro transpose_4x4 r0 r1 r2 r3
39
        vtrn.16         \r0, \r2
40
        vtrn.16         \r1, \r3
41
        vtrn.8          \r0, \r1
42
        vtrn.8          \r2, \r3
43
        .endm
44
45 5813e05d Måns Rullgård
        /*
         * swap4 r0..r7
         *
         * Exchanges the contents of r0..r3 with r4..r7 pairwise
         * (r0 with r4, r1 with r5, r2 with r6, r3 with r7).
         */
        .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
46
        vswp            \r0, \r4
47
        vswp            \r1, \r5
48
        vswp            \r2, \r6
49
        vswp            \r3, \r7
50
        .endm
51
52
        /*
         * transpose16_4x4 r0..r7
         *
         * In-place 4x4 transpose of 16-bit elements, applied to two
         * independent register groups (r0..r3 and r4..r7): 32-bit VTRN
         * between registers two apart, then 16-bit VTRN between
         * neighbours.  All eight operands are overwritten.
         */
        .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
53
        vtrn.32         \r0, \r2
54
        vtrn.32         \r1, \r3
55
        vtrn.32         \r4, \r6
56
        vtrn.32         \r5, \r7
57
        vtrn.16         \r0, \r1
58
        vtrn.16         \r2, \r3
59
        vtrn.16         \r4, \r5
60
        vtrn.16         \r6, \r7
61
        .endm
62
63 1cce897a Måns Rullgård
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
64 77c45373 Måns Rullgård
        /*
         * h264_chroma_mc8 type
         *
         * Emits the exported function ff_<type>_h264_chroma_mc8_neon.
         * The code tests type against "avg"; for avg the filtered rows
         * are averaged (VRHADD) with the bytes already at dst before
         * being stored, otherwise they are stored directly.
         *
         * Register use inside the generated function:
         *   r0 = dst, r1 = src, r2 = stride, r3 = h (rows left),
         *   r4/r5 = x/y loaded from the stack, lr = read copy of dst (avg).
         * Weights, with 8-byte rows and a rounding shift by 6:
         *   A  = x*y - 8*x - 8*y + 64   (pixel)
         *   B  = 8*x - x*y              (pixel to the right)
         *   C  = 8*y - x*y              (pixel below)
         *   D  = x*y                    (pixel below right)
         * Three code paths: full 2-D (x*y != 0), vertical only (label 3)
         * and horizontal only (label 5).  Every loop produces two output
         * rows per iteration and runs while h stays above zero.
         */
        .macro  h264_chroma_mc8 type
65
function ff_\type\()_h264_chroma_mc8_neon, export=1
66 1cce897a Måns Rullgård
        push            {r4-r7, lr}
67
        /* five registers were pushed, so stack arguments start at sp+20 */
        ldrd            r4,  [sp, #20]
68 77c45373 Måns Rullgård
.ifc \type,avg
69 1cce897a Måns Rullgård
        mov             lr,  r0
70
.endif
71
        pld             [r1]
72
        pld             [r1, r2]
73
74
        /* r7 = D; the flags set here are consumed by the beq below */
        muls            r7,  r4,  r5
75
        rsb             r6,  r7,  r5,  lsl #3
76
        rsb             ip,  r7,  r4,  lsl #3
77
        sub             r4,  r7,  r4,  lsl #3
78
        sub             r4,  r4,  r5,  lsl #3
79
        add             r4,  r4,  #64
80
81
        /* x*y == 0: at most one direction needs filtering */
        beq             2f
82
83
        /* 2-D path: r1 and r5 walk alternate rows, each stepping 2*stride */
        add             r5,  r1,  r2
84
85
        vdup.8          d0,  r4
86
        lsl             r4,  r2,  #1
87
        vdup.8          d1,  ip
88
        vld1.64         {d4, d5}, [r1], r4
89
        vdup.8          d2,  r6
90
        vld1.64         {d6, d7}, [r5], r4
91
        vdup.8          d3,  r7
92
93
        /* d5/d7 = the row shifted left by one pixel */
        vext.8          d5,  d4,  d5,  #1
94
        vext.8          d7,  d6,  d7,  #1
95
96
1:      pld             [r5]
97
        vmull.u8        q8,  d4,  d0
98
        vmlal.u8        q8,  d5,  d1
99
        vld1.64         {d4, d5}, [r1], r4
100
        vmlal.u8        q8,  d6,  d2
101
        vext.8          d5,  d4,  d5,  #1
102
        vmlal.u8        q8,  d7,  d3
103
        vmull.u8        q9,  d6,  d0
104
        subs            r3,  r3,  #2
105
        vmlal.u8        q9,  d7,  d1
106
        vmlal.u8        q9,  d4,  d2
107
        vmlal.u8        q9,  d5,  d3
108
        vrshrn.u16      d16, q8,  #6
109
        vld1.64         {d6, d7}, [r5], r4
110
        pld             [r1]
111
        vrshrn.u16      d17, q9,  #6
112 77c45373 Måns Rullgård
.ifc \type,avg
113 1cce897a Måns Rullgård
        vld1.64         {d20}, [lr,:64], r2
114
        vld1.64         {d21}, [lr,:64], r2
115
        vrhadd.u8       q8,  q8,  q10
116
.endif
117
        vext.8          d7,  d6,  d7,  #1
118
        vst1.64         {d16}, [r0,:64], r2
119
        vst1.64         {d17}, [r0,:64], r2
120
        bgt             1b
121
122
        pop             {r4-r7, pc}
123
124
        /*
         * x*y == 0 here, so r6 = 8*y and ip = 8*x.  d0 = A, d1 = B + C
         * (at most one of the two is nonzero).  Z set by tst means y == 0.
         */
2:      tst             r6,  r6
125
        add             ip,  ip,  r6
126
        vdup.8          d0,  r4
127
        vdup.8          d1,  ip
128
129
        beq             4f
130
131
        /* vertical-only path: blend each row with the row below it */
        add             r5,  r1,  r2
132
        lsl             r4,  r2,  #1
133
        vld1.64         {d4}, [r1], r4
134
        vld1.64         {d6}, [r5], r4
135
136
3:      pld             [r5]
137
        vmull.u8        q8,  d4,  d0
138
        vmlal.u8        q8,  d6,  d1
139
        vld1.64         {d4}, [r1], r4
140
        vmull.u8        q9,  d6,  d0
141
        vmlal.u8        q9,  d4,  d1
142
        vld1.64         {d6}, [r5], r4
143
        vrshrn.u16      d16, q8,  #6
144
        vrshrn.u16      d17, q9,  #6
145 77c45373 Måns Rullgård
.ifc \type,avg
146 1cce897a Måns Rullgård
        vld1.64         {d20}, [lr,:64], r2
147
        vld1.64         {d21}, [lr,:64], r2
148
        vrhadd.u8       q8,  q8,  q10
149
.endif
150
        subs            r3,  r3,  #2
151
        pld             [r1]
152
        vst1.64         {d16}, [r0,:64], r2
153
        vst1.64         {d17}, [r0,:64], r2
154
        bgt             3b
155
156
        pop             {r4-r7, pc}
157
158
        /* horizontal-only path (y == 0): blend each pixel with its right neighbour */
4:      vld1.64         {d4, d5}, [r1], r2
159
        vld1.64         {d6, d7}, [r1], r2
160
        vext.8          d5,  d4,  d5,  #1
161
        vext.8          d7,  d6,  d7,  #1
162
163
5:      pld             [r1]
164
        subs            r3,  r3,  #2
165
        vmull.u8        q8,  d4,  d0
166
        vmlal.u8        q8,  d5,  d1
167
        vld1.64         {d4, d5}, [r1], r2
168
        vmull.u8        q9,  d6,  d0
169
        vmlal.u8        q9,  d7,  d1
170
        pld             [r1]
171
        vext.8          d5,  d4,  d5,  #1
172
        vrshrn.u16      d16, q8,  #6
173
        vrshrn.u16      d17, q9,  #6
174 77c45373 Måns Rullgård
.ifc \type,avg
175 1cce897a Måns Rullgård
        vld1.64         {d20}, [lr,:64], r2
176
        vld1.64         {d21}, [lr,:64], r2
177
        vrhadd.u8       q8,  q8,  q10
178
.endif
179
        vld1.64         {d6, d7}, [r1], r2
180
        vext.8          d7,  d6,  d7,  #1
181
        vst1.64         {d16}, [r0,:64], r2
182
        vst1.64         {d17}, [r0,:64], r2
183
        bgt             5b
184
185
        pop             {r4-r7, pc}
186 77c45373 Måns Rullgård
        .endfunc
187 1cce897a Måns Rullgård
        .endm
188
189
/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
190 77c45373 Måns Rullgård
        /*
         * h264_chroma_mc4 type
         *
         * Emits the exported function ff_<type>_h264_chroma_mc4_neon,
         * the 4-pixel-wide counterpart of the mc8 macro.  The code tests
         * type against "avg"; for avg the result is averaged (VRHADD)
         * with the bytes already at dst before being stored.
         *
         * Register use inside the generated function:
         *   r0 = dst, r1 = src, r2 = stride, r3 = h (rows left),
         *   r4/r5 = x/y loaded from the stack, lr = read copy of dst (avg).
         * The weights are computed exactly as in mc8:
         *   A = x*y - 8*x - 8*y + 64, B = 8*x - x*y, C = 8*y - x*y, D = x*y.
         *
         * Only four pixels per row are needed, so two 4-byte groups are
         * packed into each D register with VTRN.32, multiplied against
         * equally packed weights, and the two halves of the widened
         * product are then added together (VADD.I16) before the rounding
         * shift by 6.  Each loop iteration produces two output rows.
         */
        .macro  h264_chroma_mc4 type
191
function ff_\type\()_h264_chroma_mc4_neon, export=1
192 1cce897a Måns Rullgård
        push            {r4-r7, lr}
193
        /* five registers were pushed, so stack arguments start at sp+20 */
        ldrd            r4,  [sp, #20]
194 77c45373 Måns Rullgård
.ifc \type,avg
195 1cce897a Måns Rullgård
        mov             lr,  r0
196
.endif
197
        pld             [r1]
198
        pld             [r1, r2]
199
200
        /* r7 = D; the flags set here are consumed by the beq below */
        muls            r7,  r4,  r5
201
        rsb             r6,  r7,  r5,  lsl #3
202
        rsb             ip,  r7,  r4,  lsl #3
203
        sub             r4,  r7,  r4,  lsl #3
204
        sub             r4,  r4,  r5,  lsl #3
205
        add             r4,  r4,  #64
206
207
        /* x*y == 0: at most one direction needs filtering */
        beq             2f
208
209
        /* 2-D path: r1 and r5 walk alternate rows, each stepping 2*stride */
        add             r5,  r1,  r2
210
211
        vdup.8          d0,  r4
212
        lsl             r4,  r2,  #1
213
        vdup.8          d1,  ip
214
        vld1.64         {d4},     [r1], r4
215
        vdup.8          d2,  r6
216
        vld1.64         {d6},     [r5], r4
217
        vdup.8          d3,  r7
218
219
        /* after vext + vtrn.32: d4/d6 = pixels 0-3 | pixels 1-4 of the row */
        vext.8          d5,  d4,  d5,  #1
220
        vext.8          d7,  d6,  d7,  #1
221
        vtrn.32         d4,  d5
222
        vtrn.32         d6,  d7
223
224
        /* d0 = A | B and d2 = C | D, matching the pixel packing above */
        vtrn.32         d0,  d1
225
        vtrn.32         d2,  d3
226
227
1:      pld             [r5]
228
        vmull.u8        q8,  d4,  d0
229
        vmlal.u8        q8,  d6,  d2
230
        vld1.64         {d4},     [r1], r4
231
        vext.8          d5,  d4,  d5,  #1
232
        vtrn.32         d4,  d5
233
        vmull.u8        q9,  d6,  d0
234
        vmlal.u8        q9,  d4,  d2
235
        vld1.64         {d6},     [r5], r4
236
        /* fold the two packed halves: one 4-pixel row per D register */
        vadd.i16        d16, d16, d17
237
        vadd.i16        d17, d18, d19
238
        vrshrn.u16      d16, q8,  #6
239
        subs            r3,  r3,  #2
240
        pld             [r1]
241 77c45373 Måns Rullgård
.ifc \type,avg
242 1cce897a Måns Rullgård
        vld1.32         {d20[0]}, [lr,:32], r2
243
        vld1.32         {d20[1]}, [lr,:32], r2
244
        vrhadd.u8       d16, d16, d20
245
.endif
246
        vext.8          d7,  d6,  d7,  #1
247
        vtrn.32         d6,  d7
248
        vst1.32         {d16[0]}, [r0,:32], r2
249
        vst1.32         {d16[1]}, [r0,:32], r2
250
        bgt             1b
251
252
        pop             {r4-r7, pc}
253
254
        /*
         * x*y == 0 here, so r6 = 8*y and ip = 8*x.  After the vtrn,
         * d0 = A | (B + C).  Z set by tst means y == 0.
         */
2:      tst             r6,  r6
255
        add             ip,  ip,  r6
256
        vdup.8          d0,  r4
257
        vdup.8          d1,  ip
258
        vtrn.32         d0,  d1
259
260
        beq             4f
261
262
        /*
         * Vertical-only path.  d1 becomes (B + C) | A so that d4, which
         * alternately holds row n | row n+1 and row n+2 | row n+1, can be
         * multiplied by d0 and d1 in turn.
         */
        vext.32         d1,  d0,  d1,  #1
263
        add             r5,  r1,  r2
264
        lsl             r4,  r2,  #1
265
        vld1.32         {d4[0]},  [r1], r4
266
        vld1.32         {d4[1]},  [r5], r4
267
268
3:      pld             [r5]
269
        vmull.u8        q8,  d4,  d0
270
        vld1.32         {d4[0]},  [r1], r4
271
        vmull.u8        q9,  d4,  d1
272
        vld1.32         {d4[1]},  [r5], r4
273
        vadd.i16        d16, d16, d17
274
        vadd.i16        d17, d18, d19
275
        vrshrn.u16      d16, q8,  #6
276 77c45373 Måns Rullgård
.ifc \type,avg
277 1cce897a Måns Rullgård
        vld1.32         {d20[0]}, [lr,:32], r2
278
        vld1.32         {d20[1]}, [lr,:32], r2
279
        vrhadd.u8       d16, d16, d20
280
.endif
281
        subs            r3,  r3,  #2
282
        pld             [r1]
283
        vst1.32         {d16[0]}, [r0,:32], r2
284
        vst1.32         {d16[1]}, [r0,:32], r2
285
        bgt             3b
286
287
        pop             {r4-r7, pc}
288
289
        /* horizontal-only path (y == 0): pixels 0-3 | pixels 1-4 times A | (B + C) */
4:      vld1.64         {d4},     [r1], r2
290
        vld1.64         {d6},     [r1], r2
291
        vext.8          d5,  d4,  d5,  #1
292
        vext.8          d7,  d6,  d7,  #1
293
        vtrn.32         d4,  d5
294
        vtrn.32         d6,  d7
295
296
5:      vmull.u8        q8,  d4,  d0
297
        vmull.u8        q9,  d6,  d0
298
        subs            r3,  r3,  #2
299
        vld1.64         {d4},     [r1], r2
300
        vext.8          d5,  d4,  d5,  #1
301
        vtrn.32         d4,  d5
302
        vadd.i16        d16, d16, d17
303
        vadd.i16        d17, d18, d19
304
        pld             [r1]
305
        vrshrn.u16      d16, q8,  #6
306 77c45373 Måns Rullgård
.ifc \type,avg
307 1cce897a Måns Rullgård
        vld1.32         {d20[0]}, [lr,:32], r2
308
        vld1.32         {d20[1]}, [lr,:32], r2
309
        vrhadd.u8       d16, d16, d20
310
.endif
311
        vld1.64         {d6},     [r1], r2
312
        vext.8          d7,  d6,  d7,  #1
313
        vtrn.32         d6,  d7
314
        pld             [r1]
315
        vst1.32         {d16[0]}, [r0,:32], r2
316
        vst1.32         {d16[1]}, [r0,:32], r2
317
        bgt             5b
318
319
        pop             {r4-r7, pc}
320 77c45373 Måns Rullgård
        .endfunc
321 1cce897a Måns Rullgård
        .endm
322
323
        /*
         * Code section start.  The four macro invocations below emit the
         * put and avg variants of the 8-wide and 4-wide chroma MC
         * functions.
         */
        .text
324
        .align
325
326 77c45373 Måns Rullgård
        h264_chroma_mc8 put
327
        h264_chroma_mc8 avg
328
        h264_chroma_mc4 put
329
        h264_chroma_mc4 avg
330 ad74a0f8 Måns Rullgård
331
        /* H.264 loop filter */
332
333
        /*
         * h264_loop_filter_start
         *
         * Common prologue of the loop filter entry points.
         *   in:  r2, r3 = the values the filter macros treat as alpha and
         *        beta; [sp] = pointer to four signed bytes (the per-edge
         *        clipping values used as tc0 by the filter macros).
         *   out: d24[0] = those four bytes, ip = scratch.
         * Returns to the caller immediately when r2 or r3 is zero, or when
         * all four bytes are negative (the sign bit of the AND of the four
         * bytes ends up in bit 31 of ip).
         *
         * The second early return tests N directly (mi).  ANDS with a
         * shifted operand does not write V, so an lt test (N != V) would
         * depend on whatever V value the caller happened to leave behind.
         */
        .macro h264_loop_filter_start
334
        ldr             ip,  [sp]
335
        tst             r2,  r2
336
        ldr             ip,  [ip]
337
        tstne           r3,  r3
338
        vmov.32         d24[0], ip
339
        and             ip,  ip,  ip, lsl #16
340
        bxeq            lr
341
        ands            ip,  ip,  ip, lsl #8
342
        bxmi            lr
343
        .endm
344
345
        /*
         * align_push_regs
         *
         * Saves d8-d15 on the stack at a 16-byte aligned address.
         * ip = (sp & 15) + 32 is subtracted from sp, which aligns sp and
         * reserves room for d12-d15; another 32 bytes are then reserved
         * for d8-d11.  ip must stay intact until align_pop_regs, which
         * uses it to undo the alignment adjustment.
         */
        .macro align_push_regs
346
        and             ip,  sp,  #15
347
        add             ip,  ip,  #32
348
        sub             sp,  sp,  ip
349
        vst1.64         {d12-d15}, [sp,:128]
350
        sub             sp,  sp,  #32
351
        vst1.64         {d8-d11},  [sp,:128]
352
        .endm
353
354
        /*
         * align_pop_regs
         *
         * Counterpart of align_push_regs: reloads d8-d11 (sp advances by
         * 32 through the writeback), then d12-d15 with a post-increment of
         * ip, which brings sp back to its value before align_push_regs.
         * Requires ip to still hold the value computed by align_push_regs.
         */
        .macro align_pop_regs
355
        vld1.64         {d8-d11},  [sp,:128]!
356
        vld1.64         {d12-d15}, [sp,:128], ip
357
        .endm
358
359
        /*
         * h264_loop_filter_luma
         *
         * Filters 16 luma pixel positions across one edge.
         *   in:  q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2
         *        (one byte per position), r2 = alpha, r3 = beta,
         *        d24[0] = four tc0 bytes (see h264_loop_filter_start).
         *   out: q4 = filtered p1, q8 = filtered p0, q0 = filtered q0,
         *        q5 = filtered q1 (the registers the callers store).
         *   clobbers: q2, q6, q7, q10-q15.
         * q4-q7 are written, so callers save d8-d15 first
         * (align_push_regs).
         */
        .macro h264_loop_filter_luma
360
        vdup.8          q11, r2         @ alpha
361
        /* the vmovl/vsli sequence on q12 repeats each tc0 byte four times */
        vmovl.u8        q12, d24
362
        vabd.u8         q6,  q8,  q0    @ abs(p0 - q0)
363
        vmovl.u16       q12, d24
364
        vabd.u8         q14, q9,  q8    @ abs(p1 - p0)
365
        vsli.16         q12, q12, #8
366
        vabd.u8         q15, q1,  q0    @ abs(q1 - q0)
367
        vsli.32         q12, q12, #16
368
        vclt.u8         q6,  q6,  q11   @ < alpha
369
        vdup.8          q11, r3         @ beta
370
        /* q7 = mask of positions whose tc0 is negative; they are left unfiltered */
        vclt.s8         q7,  q12, #0
371
        vclt.u8         q14, q14, q11   @ < beta
372
        vclt.u8         q15, q15, q11   @ < beta
373
        vbic            q6,  q6,  q7
374
        vabd.u8         q4,  q10, q8    @ abs(p2 - p0)
375
        vand            q6,  q6,  q14
376
        vabd.u8         q5,  q2,  q0    @ abs(q2 - q0)
377
        vclt.u8         q4,  q4,  q11   @ < beta
378
        vand            q6,  q6,  q15
379
        vclt.u8         q5,  q5,  q11   @ < beta
380
        /* q6 = overall filter mask; q4/q5 = masks for modifying p1/q1 */
        vand            q4,  q4,  q6
381
        vand            q5,  q5,  q6
382
        vand            q12, q12, q6
383
        vrhadd.u8       q14, q8,  q0
384
        /* q4/q5 are all-ones (-1) where set, so subtracting them adds 1 to the clip limit */
        vsub.i8         q6,  q12, q4
385
        vqadd.u8        q7,  q9,  q12
386
        vhadd.u8        q10, q10, q14
387
        vsub.i8         q6,  q6,  q5
388
        vhadd.u8        q14, q2,  q14
389
        vmin.u8         q7,  q7,  q10
390
        vqsub.u8        q11, q9,  q12
391
        vqadd.u8        q2,  q1,  q12
392
        vmax.u8         q7,  q7,  q11
393
        vqsub.u8        q11, q1,  q12
394
        vmin.u8         q14, q2,  q14
395
        /* q2/q10 (16-bit) = ((q0 - p0) * 4 + p1 - q1), narrowed with rounding shift by 3 */
        vmovl.u8        q2,  d0
396
        vmax.u8         q14, q14, q11
397
        vmovl.u8        q10, d1
398
        vsubw.u8        q2,  q2,  d16
399
        vsubw.u8        q10, q10, d17
400
        vshl.i16        q2,  q2,  #2
401
        vshl.i16        q10, q10, #2
402
        vaddw.u8        q2,  q2,  d18
403
        vaddw.u8        q10, q10, d19
404
        vsubw.u8        q2,  q2,  d2
405
        vsubw.u8        q10, q10, d3
406
        vrshrn.i16      d4,  q2,  #3
407
        vrshrn.i16      d5,  q10, #3
408
        /* select the clipped p1/q1 candidates where the masks are set, else keep p1/q1 */
        vbsl            q4,  q7,  q9
409
        vbsl            q5,  q14, q1
410
        /* clamp the delta in q2 to [-q6, q6] */
        vneg.s8         q7,  q6
411
        vmovl.u8        q14, d16
412
        vmin.s8         q2,  q2,  q6
413
        vmovl.u8        q6,  d17
414
        vmax.s8         q2,  q2,  q7
415
        vmovl.u8        q11, d0
416
        vmovl.u8        q12, d1
417
        /* p0 += delta, q0 -= delta, with saturation to 0..255 on narrowing */
        vaddw.s8        q14, q14, d4
418
        vaddw.s8        q6,  q6,  d5
419
        vsubw.s8        q11, q11, d4
420
        vsubw.s8        q12, q12, d5
421
        vqmovun.s16     d16, q14
422
        vqmovun.s16     d17, q6
423
        vqmovun.s16     d0,  q11
424
        vqmovun.s16     d1,  q12
425
        .endm
426
427
/*
 * ff_h264_v_loop_filter_luma_neon
 *
 * Luma loop filter across a horizontal edge, 16 pixels wide.
 *   r0 = pointer to the first row below the edge (16-byte aligned
 *        accesses are used), r1 = stride, r2 = alpha, r3 = beta,
 *   [sp] = pointer to the four tc0 bytes.
 * Reads the three rows below and the three rows above the edge and
 * writes back the two rows on each side of it.
 */
function ff_h264_v_loop_filter_luma_neon, export=1
428
        h264_loop_filter_start
429
430
        /* rows at and below r0: q0, q1, q2 */
        vld1.64         {d0, d1},  [r0,:128], r1
431
        vld1.64         {d2, d3},  [r0,:128], r1
432
        vld1.64         {d4, d5},  [r0,:128], r1
433
        /* step back 6 rows in total: r0 now points three rows above the edge */
        sub             r0,  r0,  r1, lsl #2
434
        sub             r0,  r0,  r1, lsl #1
435
        /* rows above: p2, p1, p0 */
        vld1.64         {d20,d21}, [r0,:128], r1
436
        vld1.64         {d18,d19}, [r0,:128], r1
437
        vld1.64         {d16,d17}, [r0,:128], r1
438
439
        align_push_regs
440
441
        h264_loop_filter_luma
442
443
        /* store filtered p1, p0, q0, q1 starting two rows above the edge */
        sub             r0,  r0,  r1, lsl #1
444
        vst1.64         {d8, d9},  [r0,:128], r1
445
        vst1.64         {d16,d17}, [r0,:128], r1
446
        vst1.64         {d0, d1},  [r0,:128], r1
447
        vst1.64         {d10,d11}, [r0,:128]
448
449
        align_pop_regs
450
        bx              lr
451
        .endfunc
452
453
/*
 * ff_h264_h_loop_filter_luma_neon
 *
 * Luma loop filter across a vertical edge, 16 rows high.
 *   r0 = pointer to the first pixel right of the edge in the first row,
 *   r1 = stride, r2 = alpha, r3 = beta,
 *   [sp] = pointer to the four tc0 bytes.
 * Loads 8 bytes (4 on each side of the edge) from each of 16 rows,
 * transposes them so every register holds one pixel column, runs the
 * same filter macro as the vertical variant, then transposes the four
 * modified columns back and stores 4 bytes per row.
 */
function ff_h264_h_loop_filter_luma_neon, export=1
454
        h264_loop_filter_start
455
456
        sub             r0,  r0,  #4
457
        /* rows 0-7 go to the low halves, rows 8-15 to the high halves of q3, q10, q9, q8, q0, q1, q2, q13 */
        vld1.64         {d6},  [r0], r1
458
        vld1.64         {d20}, [r0], r1
459
        vld1.64         {d18}, [r0], r1
460
        vld1.64         {d16}, [r0], r1
461
        vld1.64         {d0},  [r0], r1
462
        vld1.64         {d2},  [r0], r1
463
        vld1.64         {d4},  [r0], r1
464
        vld1.64         {d26}, [r0], r1
465
        vld1.64         {d7},  [r0], r1
466
        vld1.64         {d21}, [r0], r1
467
        vld1.64         {d19}, [r0], r1
468
        vld1.64         {d17}, [r0], r1
469
        vld1.64         {d1},  [r0], r1
470
        vld1.64         {d3},  [r0], r1
471
        vld1.64         {d5},  [r0], r1
472
        vld1.64         {d27}, [r0], r1
473
474 5813e05d Måns Rullgård
        /* after the transpose: q10 = p2, q9 = p1, q8 = p0, q0 = q0, q1 = q1, q2 = q2 */
        transpose_8x8   q3, q10, q9, q8, q0, q1, q2, q13
475 ad74a0f8 Måns Rullgård
476
        align_push_regs
477
478
        h264_loop_filter_luma
479
480 2da4e5e3 Måns Rullgård
        /* turn the filtered p1, p0, q0, q1 columns back into 4-byte row pieces */
        transpose_4x4   q4, q8, q0, q5
481 ad74a0f8 Måns Rullgård
482
        /* rewind 16 rows; +2 lands two pixels left of the edge */
        sub             r0,  r0,  r1, lsl #4
483 2da4e5e3 Måns Rullgård
        add             r0,  r0,  #2
484
        vst1.32         {d8[0]},  [r0], r1
485
        vst1.32         {d16[0]}, [r0], r1
486
        vst1.32         {d0[0]},  [r0], r1
487
        vst1.32         {d10[0]}, [r0], r1
488
        vst1.32         {d8[1]},  [r0], r1
489
        vst1.32         {d16[1]}, [r0], r1
490
        vst1.32         {d0[1]},  [r0], r1
491
        vst1.32         {d10[1]}, [r0], r1
492
        vst1.32         {d9[0]},  [r0], r1
493
        vst1.32         {d17[0]}, [r0], r1
494
        vst1.32         {d1[0]},  [r0], r1
495
        vst1.32         {d11[0]}, [r0], r1
496
        vst1.32         {d9[1]},  [r0], r1
497
        vst1.32         {d17[1]}, [r0], r1
498
        vst1.32         {d1[1]},  [r0], r1
499
        vst1.32         {d11[1]}, [r0], r1
500 ad74a0f8 Måns Rullgård
501
        align_pop_regs
502
        bx              lr
503
        .endfunc
504
505
        /*
         * h264_loop_filter_chroma
         *
         * Filters 8 chroma pixel positions across one edge.
         *   in:  d18 = p1, d16 = p0, d0 = q0, d2 = q1 (one byte per
         *        position), r2 = alpha, r3 = beta,
         *        d24[0] = four tc bytes (see h264_loop_filter_start).
         *   out: d16 = filtered p0, d0 = filtered q0.
         *   clobbers: q2, d22, d24-d26, q14, d30, q11.
         * Only p0 and q0 are modified; d8-d15 are not touched.
         */
        .macro h264_loop_filter_chroma
506
        vdup.8          d22, r2         @ alpha
507
        /* vmovl + vsli below repeat each tc byte twice (8 bytes in d24) */
        vmovl.u8        q12, d24
508
        vabd.u8         d26, d16, d0    @ abs(p0 - q0)
509
        /* q2 (16-bit) accumulates (q0 - p0) * 4 + p1 - q1 */
        vmovl.u8        q2,  d0
510
        vabd.u8         d28, d18, d16   @ abs(p1 - p0)
511
        vsubw.u8        q2,  q2,  d16
512
        vsli.16         d24, d24, #8
513
        vshl.i16        q2,  q2,  #2
514
        vabd.u8         d30, d2,  d0    @ abs(q1 - q0)
515
        vaddw.u8        q2,  q2,  d18
516
        vclt.u8         d26, d26, d22   @ < alpha
517
        vsubw.u8        q2,  q2,  d2
518
        vdup.8          d22, r3         @ beta
519
        /* d25 = mask of positions whose tc is negative; they are left unfiltered */
        vclt.s8         d25, d24, #0
520
        vrshrn.i16      d4,  q2,  #3
521
        vclt.u8         d28, d28, d22   @ < beta
522
        vbic            d26, d26, d25
523
        vclt.u8         d30, d30, d22   @ < beta
524
        vand            d26, d26, d28
525
        vneg.s8         d25, d24
526
        vand            d26, d26, d30
527
        /* clamp the delta to [-tc, tc]; masked-out positions get delta 0 before the lower clamp */
        vmin.s8         d4,  d4,  d24
528
        vmovl.u8        q14, d16
529
        vand            d4,  d4,  d26
530
        vmax.s8         d4,  d4,  d25
531
        vmovl.u8        q11, d0
532
        /* p0 += delta, q0 -= delta, with saturation to 0..255 on narrowing */
        vaddw.s8        q14, q14, d4
533
        vsubw.s8        q11, q11, d4
534
        vqmovun.s16     d16, q14
535
        vqmovun.s16     d0,  q11
536
        .endm
537
538
/*
 * ff_h264_v_loop_filter_chroma_neon
 *
 * Chroma loop filter across a horizontal edge, 8 pixels wide.
 *   r0 = pointer to the first row below the edge (8-byte aligned
 *        accesses are used), r1 = stride, r2 = alpha, r3 = beta,
 *   [sp] = pointer to the four tc bytes.
 * Reads two rows on each side of the edge and rewrites the row
 * directly above and the row directly below it.
 */
function ff_h264_v_loop_filter_chroma_neon, export=1
539
        h264_loop_filter_start
540
541
        /* start two rows above the edge: p1, p0, q0, q1 */
        sub             r0,  r0,  r1, lsl #1
542
        vld1.64         {d18}, [r0,:64], r1
543
        vld1.64         {d16}, [r0,:64], r1
544
        vld1.64         {d0},  [r0,:64], r1
545
        vld1.64         {d2},  [r0,:64]
546
547
        h264_loop_filter_chroma
548
549
        /* r0 was left on the q1 row; go back to the p0 row and store p0, q0 */
        sub             r0,  r0,  r1, lsl #1
550
        vst1.64         {d16}, [r0,:64], r1
551
        vst1.64         {d0},  [r0,:64], r1
552
553
        bx              lr
554
        .endfunc
555
556
/*
 * ff_h264_h_loop_filter_chroma_neon
 *
 * Chroma loop filter across a vertical edge, 8 rows high.
 *   r0 = pointer to the first pixel right of the edge in the first row,
 *   r1 = stride, r2 = alpha, r3 = beta,
 *   [sp] = pointer to the four tc bytes.
 * Loads 4 bytes (2 on each side of the edge) from each of 8 rows,
 * transposes them into the p1/p0/q0/q1 column layout expected by
 * h264_loop_filter_chroma, filters, transposes back and stores the
 * same 4 bytes per row.
 */
function ff_h264_h_loop_filter_chroma_neon, export=1
557
        h264_loop_filter_start
558
559
        sub             r0,  r0,  #2
560
        vld1.32         {d18[0]}, [r0], r1
561
        vld1.32         {d16[0]}, [r0], r1
562
        vld1.32         {d0[0]},  [r0], r1
563
        vld1.32         {d2[0]},  [r0], r1
564
        vld1.32         {d18[1]}, [r0], r1
565
        vld1.32         {d16[1]}, [r0], r1
566
        vld1.32         {d0[1]},  [r0], r1
567
        vld1.32         {d2[1]},  [r0], r1
568
569
        /* 4x4 byte transposes: rows become columns (d18 = p1, d16 = p0, d0 = q0, d2 = q1) */
        vtrn.16         d18, d0
570
        vtrn.16         d16, d2
571
        vtrn.8          d18, d16
572
        vtrn.8          d0,  d2
573
574
        h264_loop_filter_chroma
575
576
        /* the same sequence again undoes the transpose */
        vtrn.16         d18, d0
577
        vtrn.16         d16, d2
578
        vtrn.8          d18, d16
579
        vtrn.8          d0,  d2
580
581
        /* rewind the 8 rows advanced by the loads */
        sub             r0,  r0,  r1, lsl #3
582
        vst1.32         {d18[0]}, [r0], r1
583
        vst1.32         {d16[0]}, [r0], r1
584
        vst1.32         {d0[0]},  [r0], r1
585
        vst1.32         {d2[0]},  [r0], r1
586
        vst1.32         {d18[1]}, [r0], r1
587
        vst1.32         {d16[1]}, [r0], r1
588
        vst1.32         {d0[1]},  [r0], r1
589
        vst1.32         {d2[1]},  [r0], r1
590
591
        bx              lr
592
        .endfunc
593 5813e05d Måns Rullgård
594
        /* H.264 qpel MC */
595
596
        /*
         * lowpass_const r
         *
         * Loads the filter constants used by the lowpass macros: the
         * scratch core register r is set to (20 << 16) | 5 and copied to
         * the low 32 bits of d6, so the 16-bit lane d6[0] holds 5 and
         * d6[1] holds 20.  Clobbers r.
         */
        .macro  lowpass_const r
597
        movw            \r,  #5
598
        movt            \r,  #20
599
        vmov.32         d6[0], \r
600
        .endm
601
602
        /*
         * lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
         *
         * Applies the 6-tap filter
         *     x[0] - 5*x[1] + 20*x[2] + 20*x[3] - 5*x[4] + x[5]
         * at 8 consecutive positions of two independent rows.  Row one is
         * the 16 bytes held in the D-register pair r0:r1, row two the
         * pair r2:r3.
         *
         * narrow=1: the 16-bit sums are built in q0/q8 and written to the
         *           D registers d0/d1 after a rounding shift right by 5
         *           with saturation to unsigned 8 bit.
         * narrow=0: d0/d1 name Q registers that receive the unshifted
         *           16-bit sums.
         *
         * Requires d6[0] = 5 and d6[1] = 20 (see lowpass_const).
         * Clobbers q1, q2, q9, q10, d30, d31 and, when narrowing, q0, q8.
         */
        .macro  lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
603
.if \narrow
604
        t0 .req q0
605
        t1 .req q8
606
.else
607
        t0 .req \d0
608
        t1 .req \d1
609
.endif
610
        /* q1 = x[2] + x[3], q2 = x[1] + x[4], t0 = x[0] + x[5] for row one */
        vext.8          d2,  \r0, \r1, #2
611
        vext.8          d3,  \r0, \r1, #3
612
        vaddl.u8        q1,  d2,  d3
613
        vext.8          d4,  \r0, \r1, #1
614
        vext.8          d5,  \r0, \r1, #4
615
        vaddl.u8        q2,  d4,  d5
616
        vext.8          d30, \r0, \r1, #5
617
        vaddl.u8        t0,  \r0, d30
618
        /* row two (q9, q10, t1) is interleaved with the multiply-accumulates of row one */
        vext.8          d18, \r2, \r3, #2
619
        vmla.i16        t0,  q1,  d6[1]
620
        vext.8          d19, \r2, \r3, #3
621
        vaddl.u8        q9,  d18, d19
622
        vext.8          d20, \r2, \r3, #1
623
        vmls.i16        t0,  q2,  d6[0]
624
        vext.8          d21, \r2, \r3, #4
625
        vaddl.u8        q10, d20, d21
626
        vext.8          d31, \r2, \r3, #5
627
        vaddl.u8        t1,  \r2, d31
628
        vmla.i16        t1,  q9,  d6[1]
629
        vmls.i16        t1,  q10, d6[0]
630
.if \narrow
631
        vqrshrun.s16    \d0, t0,  #5
632
        vqrshrun.s16    \d1, t1,  #5
633
.endif
634
        .unreq  t0
635
        .unreq  t1
636
        .endm
637
638
        /*
         * lowpass_8_1 r0, r1, d0, narrow=1
         *
         * Single-row form of lowpass_8: applies the 6-tap filter
         *     x[0] - 5*x[1] + 20*x[2] + 20*x[3] - 5*x[4] + x[5]
         * at 8 consecutive positions of the 16 bytes held in the
         * D-register pair r0:r1.
         *
         * narrow=1: the 16-bit sums are built in q0 and written to the
         *           D register d0 after a rounding shift right by 5 with
         *           saturation to unsigned 8 bit.
         * narrow=0: d0 names a Q register that receives the unshifted
         *           16-bit sums.
         *
         * Requires d6[0] = 5 and d6[1] = 20 (see lowpass_const).
         * Clobbers q1, q2, d30 and, when narrowing, q0.
         */
        .macro  lowpass_8_1 r0, r1, d0, narrow=1
639
.if \narrow
640
        t0 .req q0
641
.else
642
        t0 .req \d0
643
.endif
644
        vext.8          d2,  \r0, \r1, #2
645
        vext.8          d3,  \r0, \r1, #3
646
        vaddl.u8        q1,  d2,  d3
647
        vext.8          d4,  \r0, \r1, #1
648
        vext.8          d5,  \r0, \r1, #4
649
        vaddl.u8        q2,  d4,  d5
650
        vext.8          d30, \r0, \r1, #5
651
        vaddl.u8        t0,  \r0, d30
652
        vmla.i16        t0,  q1,  d6[1]
653
        vmls.i16        t0,  q2,  d6[0]
654
.if \narrow
655
        vqrshrun.s16    \d0, t0,  #5
656
.endif
657
        .unreq  t0
658
        .endm
659
660
        /*
         * lowpass_8.16 r0, r1, l0, h0, l1, h1, d
         *
         * 6-tap filter over signed 16-bit input, producing 8 bytes.
         * r0:r1 are two Q registers holding 16 consecutive 16-bit values.
         * The inner taps are formed with VEXT at element offsets 1..4 and
         * widened to 32 bit; 20*a is computed as (a << 4) + (a << 2) and
         * 5*b as b + (b << 2).  The outer-tap sum is l0 + l1 (low four
         * results) and h0 + h1 (high four results).
         * NOTE(review): r1 is overwritten with the offset-5 VEXT before
         * l1/h1 are read, so callers presumably pass the halves of r0 and
         * r1 as l0/h0 and l1/h1 - confirm against the call sites.
         *
         * The 32-bit sums are narrowed with a rounding shift right by 10
         * and then saturated to unsigned 8 bit into the D register d.
         * Clobbers q0-q3 (so the d6 constants of lowpass_const are lost),
         * q8-q10, q15 and r1.
         */
        .macro  lowpass_8.16 r0, r1, l0, h0, l1, h1, d
661
        vext.16         q1,  \r0, \r1, #2
662
        vext.16         q0,  \r0, \r1, #3
663
        vaddl.s16       q9,  d2,  d0
664
        vext.16         q2,  \r0, \r1, #1
665
        vaddl.s16       q1,  d3,  d1
666
        vext.16         q3,  \r0, \r1, #4
667
        vaddl.s16       q10, d4,  d6
668
        vext.16         \r1, \r0, \r1, #5
669
        vaddl.s16       q2,  d5,  d7
670
        vaddl.s16       q0,  \h0, \h1
671
        vaddl.s16       q8,  \l0, \l1
672
673
        /* low four results: q9 = 20 * (x[2] + x[3]), q10 = 5 * (x[1] + x[4]) */
        vshl.i32        q3,  q9,  #4
674
        vshl.i32        q9,  q9,  #2
675
        vshl.i32        q15, q10, #2
676
        vadd.i32        q9,  q9,  q3
677
        vadd.i32        q10, q10, q15
678
679
        /* high four results: q1 = 20 * (x[2] + x[3]), q2 = 5 * (x[1] + x[4]) */
        vshl.i32        q3,  q1,  #4
680
        vshl.i32        q1,  q1,  #2
681
        vshl.i32        q15, q2,  #2
682
        vadd.i32        q1,  q1,  q3
683
        vadd.i32        q2,  q2,  q15
684
685
        vadd.i32        q9,  q9,  q8
686
        vsub.i32        q9,  q9,  q10
687
688
        vadd.i32        q1,  q1,  q0
689
        vsub.i32        q1,  q1,  q2
690
691
        vrshrn.s32      d18, q9,  #10
692
        vrshrn.s32      d19, q1,  #10
693
694
        vqmovun.s16     \d,  q9
695
        .endm
696
697
/*
 * put_h264_qpel16_h_lowpass_neon_packed
 *
 * Horizontal lowpass of a 16-row block done as two 8-pixel-wide passes
 * of put_h264_qpel8_h_lowpass_neon with a fixed destination stride of 8.
 *   r0 = dst, r1 = src, r2 = src stride.
 * r0 is not rewound between the passes, so the output of the right
 * half follows the output of the left half in memory.
 * Clobbers r3 (set to 8), r4 (holds the return address) and ip; the
 * second pass is a tail call.
 */
function put_h264_qpel16_h_lowpass_neon_packed
698
        mov             r4,  lr
699
        mov             ip,  #16
700
        mov             r3,  #8
701
        bl              put_h264_qpel8_h_lowpass_neon
702
        /* back to the first source row, 8 pixels to the right */
        sub             r1,  r1,  r2, lsl #4
703
        add             r1,  r1,  #8
704
        mov             ip,  #16
705
        mov             lr,  r4
706
        b               put_h264_qpel8_h_lowpass_neon
707
        .endfunc
708
709
/*
 * put_h264_qpel16_h_lowpass_neon
 *
 * Horizontal lowpass of a 16x16 block as two 8-pixel-wide halves.
 *   r0 = dst, r1 = src, r2 = src stride, r3 = dst stride.
 * The left half is done with a call to put_h264_qpel8_h_lowpass_neon;
 * both pointers are then moved back 16 rows and 8 pixels to the right.
 * There is no return instruction here: execution falls through into
 * put_h264_qpel8_h_lowpass_neon, which directly follows, for the right
 * half and returns through the lr value restored by the pop.
 */
function put_h264_qpel16_h_lowpass_neon
710
        push            {lr}
711
        mov             ip,  #16
712
        bl              put_h264_qpel8_h_lowpass_neon
713
        sub             r0,  r0,  r3, lsl #4
714
        sub             r1,  r1,  r2, lsl #4
715
        add             r0,  r0,  #8
716
        add             r1,  r1,  #8
717
        mov             ip,  #16
718
        pop             {lr}
719
        .endfunc
720
721
/*
 * put_h264_qpel8_h_lowpass_neon
 *
 * Horizontal 6-tap lowpass, 8 output pixels per row.
 *   r0 = dst (8-byte aligned stores), r1 = src, r2 = src stride,
 *   r3 = dst stride, ip = number of rows.
 * Two rows are processed per iteration and the loop ends when ip
 * reaches exactly zero, so ip must be even and nonzero on entry.
 * 16 bytes are read from each source row.  d6 is not set up here and
 * must already hold the lowpass_const values.
 */
function put_h264_qpel8_h_lowpass_neon
722
1:      vld1.64         {d0, d1},  [r1], r2
723
        vld1.64         {d16,d17}, [r1], r2
724
        subs            ip,  ip,  #2
725
        lowpass_8       d0,  d1,  d16, d17, d0,  d16
726
        vst1.64         {d0},     [r0,:64], r3
727
        vst1.64         {d16},    [r0,:64], r3
728
        bne             1b
729
        bx              lr
730
        .endfunc
731
732
/*
 * put_h264_qpel16_h_lowpass_l2_neon
 *
 * 16x16 horizontal lowpass averaged with a second source, done as two
 * 8-pixel-wide halves.
 *   r0 = dst, r1 = src, r2 = stride (used for all three pointers),
 *   r3 = second source.
 * The left half is done with a call to
 * put_h264_qpel8_h_lowpass_l2_neon; the three pointers are then moved
 * back 16 rows and 8 pixels to the right.  There is no return
 * instruction here: execution falls through into
 * put_h264_qpel8_h_lowpass_l2_neon, which directly follows, for the
 * right half.
 */
function put_h264_qpel16_h_lowpass_l2_neon
733
        push            {lr}
734
        mov             ip,  #16
735
        bl              put_h264_qpel8_h_lowpass_l2_neon
736
        sub             r0,  r0,  r2, lsl #4
737
        sub             r1,  r1,  r2, lsl #4
738
        sub             r3,  r3,  r2, lsl #4
739
        add             r0,  r0,  #8
740
        add             r1,  r1,  #8
741
        add             r3,  r3,  #8
742
        mov             ip,  #16
743
        pop             {lr}
744
        .endfunc
745
746
/*
 * put_h264_qpel8_h_lowpass_l2_neon
 *
 * Horizontal 6-tap lowpass, 8 pixels per row, whose result is averaged
 * with rounding (VRHADD) against a second source before being stored.
 *   r0 = dst (8-byte aligned stores), r1 = src, r3 = second source,
 *   r2 = stride shared by all three pointers, ip = number of rows.
 * Two rows are processed per iteration and the loop ends when ip
 * reaches exactly zero, so ip must be even and nonzero on entry.
 * d6 is not set up here and must already hold the lowpass_const values.
 */
function put_h264_qpel8_h_lowpass_l2_neon
747
1:      vld1.64         {d0, d1},  [r1], r2
748
        vld1.64         {d16,d17}, [r1], r2
749
        vld1.64         {d28},     [r3], r2
750
        vld1.64         {d29},     [r3], r2
751
        subs            ip,  ip,  #2
752
        /* both filtered rows land in q0 (d0, d1) so one VRHADD covers them */
        lowpass_8       d0,  d1,  d16, d17, d0,  d1
753
        vrhadd.u8       q0,  q0,  q14
754
        vst1.64         {d0},      [r0,:64], r2
755
        vst1.64         {d1},      [r0,:64], r2
756
        bne             1b
757
        bx              lr
758
        .endfunc
759
760
/*
 * put_h264_qpel16_v_lowpass_neon_packed
 *
 * Vertical lowpass of a 16x16 block as four 8x8 passes of
 * put_h264_qpel8_v_lowpass_neon with a fixed destination stride of 8.
 *   r0 = dst, r1 = src, r3 = src stride.
 * r0 is never rewound, so the four 8x8 results are written one after
 * another.  Each 8x8 pass leaves r1 twelve rows further down; stepping
 * back four rows positions it for the 8 rows below.
 * Clobbers r2 (set to 8) and r4 (holds the return address); the last
 * pass is a tail call.
 */
function put_h264_qpel16_v_lowpass_neon_packed
761
        mov             r4,  lr
762
        mov             r2,  #8
763
        bl              put_h264_qpel8_v_lowpass_neon
764
        sub             r1,  r1,  r3, lsl #2
765
        bl              put_h264_qpel8_v_lowpass_neon
766
        /* back 20 rows to the first source row, then 8 pixels to the right */
        sub             r1,  r1,  r3, lsl #4
767
        sub             r1,  r1,  r3, lsl #2
768
        add             r1,  r1,  #8
769
        bl              put_h264_qpel8_v_lowpass_neon
770
        sub             r1,  r1,  r3, lsl #2
771
        mov             lr,  r4
772
        b               put_h264_qpel8_v_lowpass_neon
773
        .endfunc
774
775
/*
 * put_h264_qpel16_v_lowpass_neon
 *
 * Vertical lowpass of a 16x16 block as four 8x8 passes of
 * put_h264_qpel8_v_lowpass_neon.
 *   r0 = dst, r1 = src, r2 = dst stride, r3 = src stride.
 * Order: top-left, bottom-left, then both pointers are moved back to
 * the top and 8 pixels to the right for top-right and bottom-right.
 * There is no return instruction here: after lr is restored, execution
 * falls through into put_h264_qpel8_v_lowpass_neon, which directly
 * follows, for the last 8x8 pass.  Clobbers r4 (holds the return
 * address).
 */
function put_h264_qpel16_v_lowpass_neon
776
        mov             r4,  lr
777
        bl              put_h264_qpel8_v_lowpass_neon
778
        sub             r1,  r1,  r3, lsl #2
779
        bl              put_h264_qpel8_v_lowpass_neon
780
        sub             r0,  r0,  r2, lsl #4
781
        add             r0,  r0,  #8
782
        sub             r1,  r1,  r3, lsl #4
783
        sub             r1,  r1,  r3, lsl #2
784
        add             r1,  r1,  #8
785
        bl              put_h264_qpel8_v_lowpass_neon
786
        sub             r1,  r1,  r3, lsl #2
787
        mov             lr,  r4
788
        .endfunc
789
790
/*
 * put_h264_qpel8_v_lowpass_neon
 *
 * Vertical 6-tap lowpass producing an 8x8 block.
 *   r0 = dst (8-byte aligned stores), r1 = src, r2 = dst stride,
 *   r3 = src stride.
 * Thirteen source rows of 8 bytes are loaded (r1 ends up twelve rows
 * further down).  They are transposed so that each pixel column
 * becomes a D-register pair, which lets the horizontal lowpass_8 macro
 * do the vertical filtering; the result is transposed back and stored
 * as eight rows (r0 advances by eight rows).
 * d6 is not set up here and must already hold the lowpass_const values.
 * d8-d15 are overwritten and not saved in this function.
 */
function put_h264_qpel8_v_lowpass_neon
791
        vld1.64         {d8},  [r1], r3
792
        vld1.64         {d10}, [r1], r3
793
        vld1.64         {d12}, [r1], r3
794
        vld1.64         {d14}, [r1], r3
795
        vld1.64         {d22}, [r1], r3
796
        vld1.64         {d24}, [r1], r3
797
        vld1.64         {d26}, [r1], r3
798
        vld1.64         {d28}, [r1], r3
799
        vld1.64         {d9},  [r1], r3
800
        vld1.64         {d11}, [r1], r3
801
        vld1.64         {d13}, [r1], r3
802
        vld1.64         {d15}, [r1], r3
803
        vld1.64         {d23}, [r1]
804
805
        /* rows 0-7 sit in the low halves, rows 8-12 in the high halves of these Q registers */
        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
806
        lowpass_8       d8,  d9,  d10, d11, d8,  d10
807
        lowpass_8       d12, d13, d14, d15, d12, d14
808
        lowpass_8       d22, d23, d24, d25, d22, d24
809
        lowpass_8       d26, d27, d28, d29, d26, d28
810
        /* back from columns to rows */
        transpose_8x8   d8,  d10, d12, d14, d22, d24, d26, d28
811
812
        vst1.64         {d8},  [r0,:64], r2
813
        vst1.64         {d10}, [r0,:64], r2
814
        vst1.64         {d12}, [r0,:64], r2
815
        vst1.64         {d14}, [r0,:64], r2
816
        vst1.64         {d22}, [r0,:64], r2
817
        vst1.64         {d24}, [r0,:64], r2
818
        vst1.64         {d26}, [r0,:64], r2
819
        vst1.64         {d28}, [r0,:64], r2
820
821
        bx              lr
822
        .endfunc
823
824
/*
 * put_h264_qpel16_v_lowpass_l2_neon
 *
 * 16x16 vertical lowpass averaged with a second source, as four 8x8
 * passes of put_h264_qpel8_v_lowpass_l2_neon.
 *   r0 = dst and r1 = src (both stepped with stride r3),
 *   ip = second source (stepped with stride r2).
 * Order: top-left, bottom-left, then dst, src and the second source
 * are moved back to the top and 8 pixels to the right for top-right
 * and bottom-right.  There is no return instruction here: after lr is
 * restored, execution falls through into
 * put_h264_qpel8_v_lowpass_l2_neon, which directly follows, for the
 * last 8x8 pass.  Clobbers r4 (holds the return address).
 */
function put_h264_qpel16_v_lowpass_l2_neon
825
        mov             r4,  lr
826
        bl              put_h264_qpel8_v_lowpass_l2_neon
827
        sub             r1,  r1,  r3, lsl #2
828
        bl              put_h264_qpel8_v_lowpass_l2_neon
829
        sub             r0,  r0,  r3, lsl #4
830
        sub             ip,  ip,  r2, lsl #4
831
        add             r0,  r0,  #8
832
        add             ip,  ip,  #8
833
        sub             r1,  r1,  r3, lsl #4
834
        sub             r1,  r1,  r3, lsl #2
835
        add             r1,  r1,  #8
836
        bl              put_h264_qpel8_v_lowpass_l2_neon
837
        sub             r1,  r1,  r3, lsl #2
838
        mov             lr,  r4
839
        .endfunc
840
841
@ 8x8 vertical lowpass followed by a rounding average with a second 8x8 input.
@
@ In:   r1  source; 13 rows of 8 bytes are read, row step r3
@       ip  second input; 8 rows of 8 bytes, row step r2
@       r0  output; 8 rows of 8 bytes (64-bit aligned), row step r3
@ Out:  r0 and ip advanced by 8 rows, r1 advanced by 12 rows
@ Clobbers: q0-q2, q4-q7 (d8-d15), q11-q14, plus anything the lowpass_8
@           macro (defined elsewhere in this file) uses internally.
function put_h264_qpel8_v_lowpass_l2_neon
        @ Rows 0-7 go to the low d half of q4-q7/q11-q14, rows 8-12 to the
        @ high halves; d25, d27 and d29 are not loaded.
        vld1.64         {d8},  [r1], r3
        vld1.64         {d10}, [r1], r3
        vld1.64         {d12}, [r1], r3
        vld1.64         {d14}, [r1], r3
        vld1.64         {d22}, [r1], r3
        vld1.64         {d24}, [r1], r3
        vld1.64         {d26}, [r1], r3
        vld1.64         {d28}, [r1], r3
        vld1.64         {d9},  [r1], r3
        vld1.64         {d11}, [r1], r3
        vld1.64         {d13}, [r1], r3
        vld1.64         {d15}, [r1], r3
        vld1.64         {d23}, [r1]             @ 13th row, no writeback

        @ Transpose so that the row-oriented lowpass_8 macro can be applied
        @ to columns, filter, then transpose the 8x8 result back.
        transpose_8x8   q4,  q5,  q6,  q7,  q11, q12, q13, q14
        lowpass_8       d8,  d9,  d10, d11, d8,  d9
        lowpass_8       d12, d13, d14, d15, d12, d13
        lowpass_8       d22, d23, d24, d25, d22, d23
        lowpass_8       d26, d27, d28, d29, d26, d27
        transpose_8x8   d8,  d9,  d12, d13, d22, d23, d26, d27

        @ Filter output now sits in q4, q6, q11, q13 (two rows each).
        @ Load the second input and average; q5 is free again and takes
        @ rows 6 and 7 of the second input.
        vld1.64         {d0},  [ip], r2
        vld1.64         {d1},  [ip], r2
        vld1.64         {d2},  [ip], r2
        vld1.64         {d3},  [ip], r2
        vld1.64         {d4},  [ip], r2
        vrhadd.u8       q0,  q0,  q4
        vld1.64         {d5},  [ip], r2
        vrhadd.u8       q1,  q1,  q6
        vld1.64         {d10}, [ip], r2
        vrhadd.u8       q2,  q2,  q11
        vld1.64         {d11}, [ip], r2

        vst1.64         {d0},  [r0,:64], r3
        vst1.64         {d1},  [r0,:64], r3
        vrhadd.u8       q5,  q5,  q13
        vst1.64         {d2},  [r0,:64], r3
        vst1.64         {d3},  [r0,:64], r3
        vst1.64         {d4},  [r0,:64], r3
        vst1.64         {d5},  [r0,:64], r3
        vst1.64         {d10}, [r0,:64], r3
        vst1.64         {d11}, [r0,:64], r3

        bx              lr
        .endfunc
887
888
@ Shared first part of the 8x8 horizontal+vertical ("hv") lowpass.
@
@ In:   r1  source; 13 rows of 16 bytes are read, row step r3
@       r4  128-bit aligned scratch buffer; 12 * 16 = 192 bytes are used
@ Out:  d12, d13, d14, d15, d8, d9, d10, d11 hold the eight result rows in
@       that order (matching the stores in put_h264_qpel8_hv_lowpass_neon)
@       r1 advanced by 12 rows; r4 back at the start of the scratch buffer
@ Clobbers: ip, flags, q0-q15
function put_h264_qpel8_hv_lowpass_neon_top
        lowpass_const   ip                      @ ip is only scratch here; reloaded below
        mov             ip,  #12                @ 12 rows, two per iteration
        @ First pass with narrow=0 (presumably keeps 16-bit intermediates):
        @ each iteration stores 32 bytes, i.e. 16 bytes per row.
1:      vld1.64         {d0, d1},  [r1], r3
        vld1.64         {d16,d17}, [r1], r3
        subs            ip,  ip,  #2
        lowpass_8       d0,  d1,  d16, d17, q11, q12, narrow=0
        vst1.64         {d22-d25}, [r4,:128]!
        bne             1b

        @ 13th row stays in q12 instead of going through memory.
        vld1.64         {d0, d1},  [r1]
        lowpass_8_1     d0,  d1,  q12, narrow=0

        @ Reload the 12 stored rows, walking backwards through the buffer.
        @ The last load has no writeback, leaving r4 at the buffer start.
        mov             ip,  #-16
        add             r4,  r4,  ip
        vld1.64         {d30,d31}, [r4,:128], ip
        vld1.64         {d20,d21}, [r4,:128], ip
        vld1.64         {d18,d19}, [r4,:128], ip
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d14,d15}, [r4,:128], ip
        vld1.64         {d12,d13}, [r4,:128], ip
        vld1.64         {d10,d11}, [r4,:128], ip
        vld1.64         {d8, d9},  [r4,:128], ip
        vld1.64         {d6, d7},  [r4,:128], ip
        vld1.64         {d4, d5},  [r4,:128], ip
        vld1.64         {d2, d3},  [r4,:128], ip
        vld1.64         {d0, d1},  [r4,:128]

        @ Transpose the 16-bit intermediates so the second pass can again
        @ use a row-oriented filter macro.
        swap4           d1,  d3,  d5,  d7,  d8,  d10, d12, d14
        transpose16_4x4 q0,  q1,  q2,  q3,  q4,  q5,  q6,  q7

        swap4           d17, d19, d21, d31, d24, d26, d28, d22
        transpose16_4x4 q8,  q9,  q10, q15, q12, q13, q14, q11

        @ Not everything fits in registers: park eight q registers in the
        @ scratch buffer (r4 ends at start + 112).
        vst1.64         {d30,d31}, [r4,:128]!
        vst1.64         {d6, d7},  [r4,:128]!
        vst1.64         {d20,d21}, [r4,:128]!
        vst1.64         {d4, d5},  [r4,:128]!
        vst1.64         {d18,d19}, [r4,:128]!
        vst1.64         {d2, d3},  [r4,:128]!
        vst1.64         {d16,d17}, [r4,:128]!
        vst1.64         {d0, d1},  [r4,:128]

        lowpass_8.16    q4,  q12, d8,  d9,  d24, d25, d8
        lowpass_8.16    q5,  q13, d10, d11, d26, d27, d9
        lowpass_8.16    q6,  q14, d12, d13, d28, d29, d10
        lowpass_8.16    q7,  q11, d14, d15, d22, d23, d11

        @ Fetch the parked pairs back (ip is still -16, newest first) and
        @ filter them; r4 is at the buffer start again after the last load.
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d12
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d13
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128], ip
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d14
        vld1.64         {d16,d17}, [r4,:128], ip
        vld1.64         {d30,d31}, [r4,:128]
        lowpass_8.16    q8,  q15, d16, d17, d30, d31, d15

        transpose_8x8   d12, d13, d14, d15, d8,  d9,  d10, d11

        bx              lr
        .endfunc
953
954
@ 8x8 hv lowpass, plain store variant.
@
@ In:   r0  output; 8 rows of 8 bytes (64-bit aligned), row step r2
@       r1, r3, r4  as required by put_h264_qpel8_hv_lowpass_neon_top
@ Out:  r0 advanced by 8 rows
@ Clobbers: r10 (holds the return address), plus everything the _top
@           routine clobbers.
function put_h264_qpel8_hv_lowpass_neon
        mov             r10, lr                 @ bl overwrites lr
        bl              put_h264_qpel8_hv_lowpass_neon_top
        @ Result rows come back in d12-d15 followed by d8-d11.
        vst1.64         {d12},     [r0,:64], r2
        vst1.64         {d13},     [r0,:64], r2
        vst1.64         {d14},     [r0,:64], r2
        vst1.64         {d15},     [r0,:64], r2
        vst1.64         {d8},      [r0,:64], r2
        vst1.64         {d9},      [r0,:64], r2
        vst1.64         {d10},     [r0,:64], r2
        vst1.64         {d11},     [r0,:64], r2

        mov             lr,  r10
        bx              lr
        .endfunc
969
970
@ 8x8 hv lowpass averaged with a second, packed 8x8 input.
@
@ In:   r0  output; 8 rows of 8 bytes (64-bit aligned), row step r3
@       r2  second input; 64 contiguous bytes, 128-bit aligned
@       r1, r3, r4  as required by put_h264_qpel8_hv_lowpass_neon_top
@ Out:  r0 advanced by 8 rows, r2 advanced by 64 bytes
@ Clobbers: r10 (holds the return address), plus everything the _top
@           routine clobbers.
function put_h264_qpel8_hv_lowpass_l2_neon
        mov             r10, lr                 @ bl overwrites lr
        bl              put_h264_qpel8_hv_lowpass_neon_top

        @ Filter rows are in q6, q7, q4, q5 (two rows per q register);
        @ loads and rounding averages are interleaved.
        vld1.64         {d0, d1},  [r2,:128]!
        vld1.64         {d2, d3},  [r2,:128]!
        vrhadd.u8       q0,  q0,  q6
        vld1.64         {d4, d5},  [r2,:128]!
        vrhadd.u8       q1,  q1,  q7
        vld1.64         {d6, d7},  [r2,:128]!
        vrhadd.u8       q2,  q2,  q4

        vst1.64         {d0},      [r0,:64], r3
        vrhadd.u8       q3,  q3,  q5
        vst1.64         {d1},      [r0,:64], r3
        vst1.64         {d2},      [r0,:64], r3
        vst1.64         {d3},      [r0,:64], r3
        vst1.64         {d4},      [r0,:64], r3
        vst1.64         {d5},      [r0,:64], r3
        vst1.64         {d6},      [r0,:64], r3
        vst1.64         {d7},      [r0,:64], r3

        mov             lr,  r10
        bx              lr
        .endfunc
995
996
@ 16x16 hv lowpass built from four 8x8 passes of
@ put_h264_qpel8_hv_lowpass_neon.
@
@ In:   r0  output, row step r2
@       r1  source, row step r3
@       r4  scratch buffer for the 8x8 routine
@ Clobbers: r9 (holds the return address), plus everything the 8x8
@           routine clobbers.
@ The last pass is a tail call, so it returns directly to the caller.
function put_h264_qpel16_hv_lowpass_neon
        mov             r9,  lr                 @ r10 is already used by the 8x8 routine
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2    @ callee advanced src by 12 rows; back 4 -> row 8
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #4    @ src back up 16 + 4 = 20 rows ...
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            @ ... and 8 bytes right
        sub             r0,  r0,  r2, lsl #4    @ output back up 16 rows
        add             r0,  r0,  #8            @ and 8 bytes right
        bl              put_h264_qpel8_hv_lowpass_neon
        sub             r1,  r1,  r3, lsl #2    @ back 4 rows for the final pass
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_neon
        .endfunc
1011
1012
@ 16x16 hv lowpass averaged with a packed second input, built from four
@ passes of put_h264_qpel8_hv_lowpass_l2_neon.
@
@ In:   r0  output, row step r3
@       r1  source, row step r3
@       r4  scratch buffer for the 8x8 routine; the packed second input is
@           taken from the 256 bytes directly below it
@ Each 8x8 pass consumes 64 bytes of the packed input, in the order
@ rows 0-7 / rows 8-15 of the left half, then the same for the right half.
@ Clobbers: r2, r9 (holds the return address), plus everything the 8x8
@           routine clobbers.
@ The last pass is a tail call, so it returns directly to the caller.
function put_h264_qpel16_hv_lowpass_l2_neon
        mov             r9,  lr                 @ r10 is already used by the 8x8 routine
        sub             r2,  r4,  #256          @ r2 = start of the packed second input
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2    @ callee advanced src by 12 rows; back 4 -> row 8
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #4    @ src back up 16 + 4 = 20 rows ...
        sub             r1,  r1,  r3, lsl #2
        add             r1,  r1,  #8            @ ... and 8 bytes right
        sub             r0,  r0,  r3, lsl #4    @ output back up 16 rows
        add             r0,  r0,  #8            @ and 8 bytes right
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        sub             r1,  r1,  r3, lsl #2    @ back 4 rows for the final pass
        mov             lr,  r9
        b               put_h264_qpel8_hv_lowpass_l2_neon
        .endfunc
1028
1029
@ Exported 8x8 qpel entry, mc10 variant.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@ Sets r3 = unmodified r1 (second input for the averaging callee), moves
@ r1 two bytes left, sets ip = 8 and tail-branches to
@ put_h264_qpel8_h_lowpass_l2_neon (defined elsewhere in this file).
function ff_put_h264_qpel8_mc10_neon, export=1
        lowpass_const   r3                      @ r3 is only scratch here; overwritten next
        mov             r3,  r1
        sub             r1,  r1,  #2
        mov             ip,  #8                 @ presumably the row count for the callee
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc
1036
1037
@ Exported 8x8 qpel entry, mc20 variant.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@ Moves r1 two bytes left, copies r2 into r3, sets ip = 8 and
@ tail-branches to put_h264_qpel8_h_lowpass_neon (defined elsewhere in
@ this file).
function ff_put_h264_qpel8_mc20_neon, export=1
        lowpass_const   r3                      @ r3 is only scratch here; overwritten below
        sub             r1,  r1,  #2
        mov             r3,  r2
        mov             ip,  #8                 @ presumably the row count for the callee
        b               put_h264_qpel8_h_lowpass_neon
        .endfunc
1044
1045
@ Exported 8x8 qpel entry, mc30 variant.
@ Same as mc10 except that the second input passed in r3 is r1 + 1
@ instead of r1.
function ff_put_h264_qpel8_mc30_neon, export=1
        lowpass_const   r3                      @ r3 is only scratch here; overwritten next
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        mov             ip,  #8                 @ presumably the row count for the callee
        b               put_h264_qpel8_h_lowpass_l2_neon
        .endfunc
1052
1053
@ Exported 8x8 qpel entry, mc01 variant: vertical lowpass averaged with
@ the rows starting at the unmodified r1.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@
@ The inner label put_h264_qpel8_mc01 is a shared entry point: callers
@ branch to it after pushing lr and choosing the second input in ip
@ (see ff_put_h264_qpel8_mc03_neon).
function ff_put_h264_qpel8_mc01_neon, export=1
        push            {lr}
        mov             ip,  r1                 @ second input = r1
put_h264_qpel8_mc01:
        lowpass_const   r3                      @ r3 is only scratch here; overwritten next
        mov             r3,  r2                 @ callee steps r0/r1 by r3 and ip by r2
        sub             r1,  r1,  r2, lsl #1    @ start two rows above
        vpush           {d8-d15}                @ callee clobbers d8-d15
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc
1065
1066
@ Exported 8x8 qpel entry, mc11 variant: horizontal lowpass into a stack
@ temporary, then vertical lowpass averaged with that temporary.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@
@ The inner label put_h264_qpel8_mc11 is a shared entry point. Callers
@ must already have pushed {r0, r1, r11, lr}: the pushed r1 is used for
@ the vertical pass, the live r1 for the horizontal pass (mc31, mc13 and
@ mc33 make the two differ).
function ff_put_h264_qpel8_mc11_neon, export=1
        push            {r0, r1, r11, lr}
put_h264_qpel8_mc11:
        lowpass_const   r3                      @ r3 is only scratch here; overwritten below
        mov             r11, sp                 @ remember sp before aligning it
        bic             sp,  sp,  #15
        sub             sp,  sp,  #64           @ 8x8 byte temporary
        mov             r0,  sp                 @ horizontal pass writes to the temporary
        sub             r1,  r1,  #2
        mov             r3,  #8                 @ temporary has an 8 byte row step
        mov             ip,  #8
        vpush           {d8-d15}                @ lowers sp by another 64 bytes
        bl              put_h264_qpel8_h_lowpass_neon
        ldrd            r0,  [r11]              @ reload the pushed r0 and r1
        mov             r3,  r2
        add             ip,  sp,  #64           @ temporary sits just above the vpush area
        sub             r1,  r1,  r2, lsl #1    @ start two rows above
        mov             r2,  #8                 @ row step of the temporary (second input)
        bl              put_h264_qpel8_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8            @ drop the temporary and the pushed r0/r1
        pop             {r11, pc}
        .endfunc
1089
1090
@ Exported 8x8 qpel entry, mc21 variant: horizontal lowpass into a stack
@ temporary, then hv lowpass averaged with that temporary.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@
@ The inner label put_h264_qpel8_mc21 is a shared entry point. Callers
@ must already have pushed {r0, r1, r4, r10, r11, lr}; the pushed r1 is
@ used for the hv pass, the live r1 for the horizontal pass.
function ff_put_h264_qpel8_mc21_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc21:
        lowpass_const   r3                      @ r3 is only scratch here; overwritten below
        mov             r11, sp                 @ remember sp before aligning it
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)  @ 64 byte temporary + 192 byte hv scratch
        sub             r1,  r1,  #2
        mov             r3,  #8                 @ temporary has an 8 byte row step
        mov             r0,  sp
        mov             ip,  #8
        vpush           {d8-d15}
        bl              put_h264_qpel8_h_lowpass_neon
        mov             r4,  r0                 @ presumably r0 now points past the 64 byte temporary
        ldrd            r0,  [r11]              @ reload the pushed r0 and r1
        sub             r1,  r1,  r2, lsl #1    @ two rows above ...
        sub             r1,  r1,  #2            @ ... and two bytes left
        mov             r3,  r2
        sub             r2,  r4,  #64           @ second input = start of the temporary
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8           @ drop the stack area and the pushed r0/r1
        pop             {r4, r10, r11, pc}
        .endfunc
1114
1115
@ Exported 8x8 qpel entry, mc31 variant.
@ Reuses the mc11 body: the pushed r1 (used there for the vertical pass)
@ is r1 + 1, while the live r1 (horizontal pass) is restored to its
@ original value.
function ff_put_h264_qpel8_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}       @ must match the push in mc11
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc
1121
1122
@ Exported 8x8 qpel entry, mc02 variant: plain vertical lowpass.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
function ff_put_h264_qpel8_mc02_neon, export=1
        push            {lr}
        lowpass_const   r3                      @ r3 is only scratch here; overwritten below
        sub             r1,  r1,  r2, lsl #1    @ start two rows above
        mov             r3,  r2
        vpush           {d8-d15}                @ preserved around the call
        bl              put_h264_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {pc}
        .endfunc
1132
1133
@ Exported 8x8 qpel entry, mc12 variant: vertical lowpass into a stack
@ temporary, then hv lowpass averaged with that temporary.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@
@ The inner label put_h264_qpel8_mc12 is a shared entry point. Callers
@ must already have pushed {r0, r1, r4, r10, r11, lr}; the pushed r1 is
@ used for the hv pass, the live r1 for the vertical pass.
function ff_put_h264_qpel8_mc12_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}
put_h264_qpel8_mc12:
        lowpass_const   r3                      @ r3 is only scratch here; overwritten below
        mov             r11, sp                 @ remember sp before aligning it
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(8*8+16*12)  @ 64 byte temporary + 192 byte hv scratch
        sub             r1,  r1,  r2, lsl #1    @ start two rows above
        mov             r3,  r2                 @ keep the original r2 in r3
        mov             r2,  #8                 @ temporary has an 8 byte row step
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_v_lowpass_neon
        mov             r4,  r0                 @ presumably r0 now points past the 64 byte temporary
        ldrd            r0,  [r11]              @ reload the pushed r0 and r1
        sub             r1,  r1,  r3, lsl #1    @ two rows above ...
        sub             r1,  r1,  #2            @ ... and two bytes left
        sub             r2,  r4,  #64           @ second input = start of the temporary
        bl              put_h264_qpel8_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8           @ drop the stack area and the pushed r0/r1
        pop             {r4, r10, r11, pc}
        .endfunc
1156
1157
@ Exported 8x8 qpel entry, mc22 variant: plain hv lowpass.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@ Allocates a 16-byte aligned 192 byte scratch buffer on the stack and
@ passes it in r4.
function ff_put_h264_qpel8_mc22_neon, export=1
        push            {r4, r10, r11, lr}
        mov             r11, sp                 @ remember sp before aligning it
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1    @ two rows above ...
        sub             r1,  r1,  #2            @ ... and two bytes left
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel8_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r10, r11, pc}
        .endfunc
1172
1173
@ Exported 8x8 qpel entry, mc32 variant.
@ Reuses the mc12 body: the pushed r1 (hv pass) is the original value,
@ the live r1 (vertical pass) is r1 + 1.
function ff_put_h264_qpel8_mc32_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}      @ must match the push in mc12
        add             r1,  r1,  #1
        b               put_h264_qpel8_mc12
        .endfunc
1178
1179
@ Exported 8x8 qpel entry, mc03 variant.
@ Reuses the mc01 body with the second input (ip) set to r1 + r2, i.e.
@ one row step further down than in mc01.
function ff_put_h264_qpel8_mc03_neon, export=1
        push            {lr}                    @ must match the push in mc01
        add             ip,  r1,  r2
        b               put_h264_qpel8_mc01
        .endfunc
1184
1185
@ Exported 8x8 qpel entry, mc13 variant.
@ Reuses the mc11 body: the pushed r1 (vertical pass) is the original
@ value, the live r1 (horizontal pass) is r1 + r2.
function ff_put_h264_qpel8_mc13_neon, export=1
        push            {r0, r1, r11, lr}       @ must match the push in mc11
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc11
        .endfunc
1190
1191
@ Exported 8x8 qpel entry, mc23 variant.
@ Reuses the mc21 body: the pushed r1 (hv pass) is the original value,
@ the live r1 (horizontal pass) is r1 + r2.
function ff_put_h264_qpel8_mc23_neon, export=1
        push            {r0, r1, r4, r10, r11, lr}      @ must match the push in mc21
        add             r1,  r1,  r2
        b               put_h264_qpel8_mc21
        .endfunc
1196
1197
@ Exported 8x8 qpel entry, mc33 variant.
@ Reuses the mc11 body: the pushed r1 (vertical pass) is r1 + 1, the
@ live r1 (horizontal pass) ends up as the original r1 + r2.
function ff_put_h264_qpel8_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r11, lr}       @ must match the push in mc11
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel8_mc11
        .endfunc
1204
1205
@ Exported 16x16 qpel entry, mc10 variant.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@ Sets r3 = unmodified r1 (second input for the averaging callee), moves
@ r1 two bytes left and tail-branches to put_h264_qpel16_h_lowpass_l2_neon
@ (defined elsewhere in this file).
function ff_put_h264_qpel16_mc10_neon, export=1
        lowpass_const   r3                      @ r3 is only scratch here; overwritten next
        mov             r3,  r1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc
1211
1212
@ Exported 16x16 qpel entry, mc20 variant.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@ Moves r1 two bytes left, copies r2 into r3 and tail-branches to
@ put_h264_qpel16_h_lowpass_neon (defined elsewhere in this file).
function ff_put_h264_qpel16_mc20_neon, export=1
        lowpass_const   r3                      @ r3 is only scratch here; overwritten below
        sub             r1,  r1,  #2
        mov             r3,  r2
        b               put_h264_qpel16_h_lowpass_neon
        .endfunc
1218
1219
@ Exported 16x16 qpel entry, mc30 variant.
@ Same as the 16x16 mc10 entry except that the second input passed in r3
@ is r1 + 1 instead of r1.
function ff_put_h264_qpel16_mc30_neon, export=1
        lowpass_const   r3                      @ r3 is only scratch here; overwritten next
        add             r3,  r1,  #1
        sub             r1,  r1,  #2
        b               put_h264_qpel16_h_lowpass_l2_neon
        .endfunc
1225
1226
@ Exported 16x16 qpel entry, mc01 variant: vertical lowpass averaged with
@ the rows starting at the unmodified r1.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@
@ The inner label put_h264_qpel16_mc01 is a shared entry point: callers
@ branch to it after pushing {r4, lr} and choosing the second input in ip
@ (see ff_put_h264_qpel16_mc03_neon).
function ff_put_h264_qpel16_mc01_neon, export=1
        push            {r4, lr}                @ r4 is clobbered by the 16x16 helper
        mov             ip,  r1                 @ second input = r1
put_h264_qpel16_mc01:
        lowpass_const   r3                      @ r3 is only scratch here; overwritten next
        mov             r3,  r2                 @ callee steps r0/r1 by r3 and ip by r2
        sub             r1,  r1,  r2, lsl #1    @ start two rows above
        vpush           {d8-d15}                @ callee clobbers d8-d15
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc
1238
1239
@ Exported 16x16 qpel entry, mc11 variant: horizontal lowpass into a stack
@ temporary, then vertical lowpass averaged with that temporary.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@
@ The inner label put_h264_qpel16_mc11 is a shared entry point. Callers
@ must already have pushed {r0, r1, r4, r11, lr}: the pushed r1 is used
@ for the vertical pass, the live r1 for the horizontal pass.
function ff_put_h264_qpel16_mc11_neon, export=1
        push            {r0, r1, r4, r11, lr}
put_h264_qpel16_mc11:
        lowpass_const   r3                      @ r3 is only scratch here; overwritten below
        mov             r11, sp                 @ remember sp before aligning it
        bic             sp,  sp,  #15
        sub             sp,  sp,  #256          @ 16x16 byte temporary
        mov             r0,  sp                 @ horizontal pass writes to the temporary
        sub             r1,  r1,  #2
        mov             r3,  #16                @ temporary has a 16 byte row step
        vpush           {d8-d15}                @ lowers sp by another 64 bytes
        bl              put_h264_qpel16_h_lowpass_neon
        ldrd            r0,  [r11]              @ reload the pushed r0 and r1
        mov             r3,  r2
        add             ip,  sp,  #64           @ temporary sits just above the vpush area
        sub             r1,  r1,  r2, lsl #1    @ start two rows above
        mov             r2,  #16                @ row step of the temporary (second input)
        bl              put_h264_qpel16_v_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11, #8            @ drop the temporary and the pushed r0/r1
        pop             {r4, r11, pc}
        .endfunc
1261
1262
@ Exported 16x16 qpel entry, mc21 variant: packed horizontal lowpass into
@ a stack temporary, then hv lowpass averaged with that temporary.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@
@ The inner label put_h264_qpel16_mc21 is a shared entry point. Callers
@ must already have pushed {r0, r1, r4-r5, r9-r11, lr}; the pushed r1 is
@ used for the hv pass, the live r1 for the horizontal pass.
function ff_put_h264_qpel16_mc21_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc21:
        lowpass_const   r3                      @ r3 is only scratch here; overwritten below
        mov             r11, sp                 @ remember sp before aligning it
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)        @ 256 byte temporary + 192 byte hv scratch
        sub             r1,  r1,  #2
        mov             r0,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_h_lowpass_neon_packed
        mov             r4,  r0                 @ presumably r0 now points past the 256 byte temporary;
                                                @ the callee reads the temporary at r4 - 256
        ldrd            r0,  [r11]              @ reload the pushed r0 and r1
        sub             r1,  r1,  r2, lsl #1    @ two rows above ...
        sub             r1,  r1,  #2            @ ... and two bytes left
        mov             r3,  r2
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8           @ drop the stack area and the pushed r0/r1
        pop             {r4-r5, r9-r11, pc}
        .endfunc
1283
1284
@ Exported 16x16 qpel entry, mc31 variant.
@ Reuses the 16x16 mc11 body: the pushed r1 (vertical pass) is r1 + 1,
@ while the live r1 (horizontal pass) is restored to its original value.
function ff_put_h264_qpel16_mc31_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}   @ must match the push in mc11
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
        .endfunc
1290
1291
@ Exported 16x16 qpel entry, mc02 variant: plain vertical lowpass.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
function ff_put_h264_qpel16_mc02_neon, export=1
        push            {r4, lr}                @ r4 presumably clobbered by the helper, as in the l2 variant
        lowpass_const   r3                      @ r3 is only scratch here; overwritten below
        sub             r1,  r1,  r2, lsl #1    @ start two rows above
        mov             r3,  r2
        vpush           {d8-d15}                @ preserved around the call
        bl              put_h264_qpel16_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
        .endfunc
1301
1302
@ Exported 16x16 qpel entry, mc12 variant: packed vertical lowpass into a
@ stack temporary, then hv lowpass averaged with that temporary.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@
@ The inner label put_h264_qpel16_mc12 is a shared entry point. Callers
@ must already have pushed {r0, r1, r4-r5, r9-r11, lr}; the pushed r1 is
@ used for the hv pass, the live r1 for the vertical pass.
function ff_put_h264_qpel16_mc12_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}
put_h264_qpel16_mc12:
        lowpass_const   r3                      @ r3 is only scratch here; overwritten below
        mov             r11, sp                 @ remember sp before aligning it
        bic             sp,  sp,  #15
        sub             sp,  sp,  #(16*16+16*12)        @ 256 byte temporary + 192 byte hv scratch
        sub             r1,  r1,  r2, lsl #1    @ start two rows above
        mov             r0,  sp
        mov             r3,  r2
        vpush           {d8-d15}
        bl              put_h264_qpel16_v_lowpass_neon_packed
        mov             r4,  r0                 @ presumably r0 now points past the 256 byte temporary;
                                                @ the callee reads the temporary at r4 - 256
        ldrd            r0,  [r11]              @ reload the pushed r0 and r1
        sub             r1,  r1,  r3, lsl #1    @ two rows above ...
        sub             r1,  r1,  #2            @ ... and two bytes left
        mov             r2,  r3                 @ note: the callee recomputes r2 from r4 on entry
        bl              put_h264_qpel16_hv_lowpass_l2_neon
        vpop            {d8-d15}
        add             sp,  r11,  #8           @ drop the stack area and the pushed r0/r1
        pop             {r4-r5, r9-r11, pc}
        .endfunc
1324
1325
@ Exported 16x16 qpel entry, mc22 variant: plain hv lowpass.
@ Presumably called as (r0 = dst, r1 = src, r2 = stride) - confirm against
@ the C prototype.
@ Allocates a 16-byte aligned 192 byte scratch buffer on the stack and
@ passes it in r4.
function ff_put_h264_qpel16_mc22_neon, export=1
        push            {r4, r9-r11, lr}
        lowpass_const   r3                      @ r3 is only scratch here; overwritten below
        mov             r11, sp                 @ remember sp before aligning it
        bic             sp,  sp,  #15
        sub             r1,  r1,  r2, lsl #1    @ two rows above ...
        sub             r1,  r1,  #2            @ ... and two bytes left
        mov             r3,  r2
        sub             sp,  sp,  #(16*12)
        mov             r4,  sp
        vpush           {d8-d15}
        bl              put_h264_qpel16_hv_lowpass_neon
        vpop            {d8-d15}
        mov             sp,  r11
        pop             {r4, r9-r11, pc}
        .endfunc
1341
1342
@ Exported 16x16 qpel entry, mc32 variant.
@ Reuses the 16x16 mc12 body: the pushed r1 (hv pass) is the original
@ value, the live r1 (vertical pass) is r1 + 1.
function ff_put_h264_qpel16_mc32_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}     @ must match the push in mc12
        add             r1,  r1,  #1
        b               put_h264_qpel16_mc12
        .endfunc
1347
1348
@ Exported 16x16 qpel entry, mc03 variant.
@ Reuses the 16x16 mc01 body with the second input (ip) set to r1 + r2,
@ i.e. one row step further down than in mc01.
function ff_put_h264_qpel16_mc03_neon, export=1
        push            {r4, lr}                @ must match the push in mc01
        add             ip,  r1,  r2
        b               put_h264_qpel16_mc01
        .endfunc
1353
1354
@ Exported 16x16 qpel entry, mc13 variant.
@ Reuses the 16x16 mc11 body: the pushed r1 (vertical pass) is the
@ original value, the live r1 (horizontal pass) is r1 + r2.
function ff_put_h264_qpel16_mc13_neon, export=1
        push            {r0, r1, r4, r11, lr}   @ must match the push in mc11
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc11
        .endfunc
1359
1360
@ Exported 16x16 qpel entry, mc23 variant.
@ Reuses the 16x16 mc21 body: the pushed r1 (hv pass) is the original
@ value, the live r1 (horizontal pass) is r1 + r2.
function ff_put_h264_qpel16_mc23_neon, export=1
        push            {r0, r1, r4-r5, r9-r11, lr}     @ must match the push in mc21
        add             r1,  r1,  r2
        b               put_h264_qpel16_mc21
        .endfunc
1365
1366
@ Exported 16x16 qpel entry, mc33 variant.
@ Reuses the 16x16 mc11 body: the pushed r1 (vertical pass) is r1 + 1,
@ the live r1 (horizontal pass) ends up as the original r1 + r2.
function ff_put_h264_qpel16_mc33_neon, export=1
        add             r1,  r1,  #1
        push            {r0, r1, r4, r11, lr}   @ must match the push in mc11
        add             r1,  r1,  r2
        sub             r1,  r1,  #1
        b               put_h264_qpel16_mc11
        .endfunc
1373 5a29589b Måns Rullgård
1374
@ Biweighted prediction
1375
1376
        @ Bi-weighted combination of two 16 byte wide blocks, two rows per
        @ loop iteration. Body of biweight_h264_pixels_16_neon.
        @
        @ Macro arguments:
        @   macs  widening multiply-accumulate used for the rows read from r1
        @   macd  widening multiply-accumulate used for the rows read from r0
        @ Expects (set up by biweight_func):
        @   r0, r1  the two inputs, 128-bit aligned, row step r2
        @   r6      output pointer, 128-bit aligned, row step r2
        @   r4, r5  weight magnitudes for the r0 and r1 rows (low byte is used)
        @   q8      initial accumulator value, q9 = per-lane shift for vshl
        @   ip      row count (multiple of 2)
        @ Ends by popping {r4-r6, pc}, i.e. it returns from the enclosing function.
        .macro  biweight_16 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q2,  q8
        vmov            q3,  q8
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r2
        \macd           q2,  d0,  d20
        pld             [r0]
        \macd           q3,  d0,  d21
        vld1.8          {d22-d23},[r1,:128], r2
        \macs           q2,  d1,  d22
        pld             [r1]
        \macs           q3,  d1,  d23
        vmov            q12, q8
        vld1.8          {d28-d29},[r0,:128], r2
        vmov            q13, q8
        \macd           q12, d0,  d28
        pld             [r0]
        \macd           q13, d0,  d29
        vld1.8          {d30-d31},[r1,:128], r2
        \macs           q12, d1,  d30
        pld             [r1]
        \macs           q13, d1,  d31
        vshl.s16        q2,  q2,  q9
        vshl.s16        q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        vshl.s16        q12, q12, q9
        vshl.s16        q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vmov            q3,  q8                 @ re-seed accumulators for the next iteration
        vst1.8          {d4- d5}, [r6,:128], r2
        vmov            q2,  q8
        vst1.8          {d24-d25},[r6,:128], r2
        bne             1b                      @ flags still come from the subs above
        pop             {r4-r6, pc}
        .endm
1415
1416
        @ Bi-weighted combination of two 8 byte wide blocks, two rows per
        @ loop iteration. Body of biweight_h264_pixels_8_neon.
        @
        @ Macro arguments:
        @   macs  widening multiply-accumulate used for the rows read from r1
        @   macd  widening multiply-accumulate used for the rows read from r0
        @ Expects (set up by biweight_func):
        @   r0, r1  the two inputs, 64-bit aligned, row step r2
        @   r6      output pointer, 64-bit aligned, row step r2
        @   r4, r5  weight magnitudes for the r0 and r1 rows (low byte is used)
        @   q8      initial accumulator value, q9 = per-lane shift for vshl
        @   ip      row count (multiple of 2)
        @ Ends by popping {r4-r6, pc}, i.e. it returns from the enclosing function.
        .macro  biweight_8 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.8          {d5},[r1,:64], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        vld1.8          {d6},[r0,:64], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.8          {d7},[r1,:64], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8                 @ re-seed accumulators for the next iteration
        vst1.8          {d2},[r6,:64], r2
        vmov            q1,  q8
        vst1.8          {d4},[r6,:64], r2
        bne             1b                      @ flags still come from the subs above
        pop             {r4-r6, pc}
        .endm
1445
1446
        @ Bi-weighted combination of two 4 byte wide blocks. Rows are packed
        @ two per d register, so one loop iteration handles four rows; a
        @ remainder of two rows is handled at label 2.
        @ Body of biweight_h264_pixels_4_neon.
        @
        @ Macro arguments:
        @   macs  widening multiply-accumulate used for the rows read from r1
        @   macd  widening multiply-accumulate used for the rows read from r0
        @ Expects (set up by biweight_func):
        @   r0, r1  the two inputs, 32-bit aligned, row step r2
        @   r6      output pointer, 32-bit aligned, row step r2
        @   r4, r5  weight magnitudes for the r0 and r1 rows (low byte is used)
        @   q8      initial accumulator value, q9 = per-lane shift for vshl
        @   ip      row count
        @ Ends by popping {r4-r6, pc}, i.e. it returns from the enclosing function.
        .macro  biweight_4 macs, macd
        vdup.8          d0,  r4
        vdup.8          d1,  r5
        vmov            q1,  q8
        vmov            q10, q8
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r2
        vld1.32         {d4[1]},[r0,:32], r2
        \macd           q1,  d0,  d4
        pld             [r0]
        vld1.32         {d5[0]},[r1,:32], r2
        vld1.32         {d5[1]},[r1,:32], r2
        \macs           q1,  d1,  d5
        pld             [r1]
        blt             2f                      @ fewer than 4 rows were left: finish the 2 just loaded
        vld1.32         {d6[0]},[r0,:32], r2
        vld1.32         {d6[1]},[r0,:32], r2
        \macd           q10, d0,  d6
        pld             [r0]
        vld1.32         {d7[0]},[r1,:32], r2
        vld1.32         {d7[1]},[r1,:32], r2
        \macs           q10, d1,  d7
        pld             [r1]
        vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vshl.s16        q10, q10, q9
        vqmovun.s16     d4,  q10
        vmov            q10, q8                 @ re-seed accumulators for the next iteration
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        vmov            q1,  q8
        vst1.32         {d4[0]},[r6,:32], r2
        vst1.32         {d4[1]},[r6,:32], r2
        bne             1b                      @ flags still come from the subs above
        pop             {r4-r6, pc}
2:      vshl.s16        q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r6,:32], r2
        vst1.32         {d2[1]},[r6,:32], r2
        pop             {r4-r6, pc}
        .endm
1487
1488
        @ Defines biweight_h264_pixels_<w>_neon, the common body behind the
        @ ff_biweight_h264_pixels_<w>x<h>_neon entry points.
        @
        @ In:   r0  first input, also the output (copied to r6)
        @       r1  second input
        @       r2  row step
        @       r3  shift base: results are shifted right by r3 + 1
        @       [sp], [sp+4], [sp+8]  three stack arguments: weight applied
        @             to the r0 rows, weight applied to the r1 rows, and an
        @             offset (presumably weightd, weights, offset in the C
        @             prototype - confirm)
        @       ip  row count, set by the entry stub
        @
        @ The multiplies are unsigned byte multiplies, so the signs of the
        @ two weights are split off here: a negative weight is negated and
        @ the matching multiply-accumulate is turned into a
        @ multiply-subtract. Each biweight_<w> expansion ends with its own
        @ pop {r4-r6, pc}, so the four variants do not fall into each other.
        .macro  biweight_func w
function biweight_h264_pixels_\w\()_neon
        push            {r4-r6, lr}
        add             r4,  sp,  #16           @ stack arguments start above the 4 pushed words
        ldm             r4,  {r4-r6}
        lsr             lr,  r4,  #31           @ sign bit of the first weight
        add             r6,  r6,  #1
        eors            lr,  lr,  r5,  lsr #30  @ r5 >> 30 is 0 or 3 for small magnitudes, giving
                                                @ lr = 0: both >= 0, 1: only r4 < 0,
                                                @      2: both < 0,  3: only r5 < 0
        orr             r6,  r6,  #1
        vdup.16         q9,  r3
        lsl             r6,  r6,  r3            @ r6 = ((offset + 1) | 1) << r3
        vmvn            q9,  q9                 @ q9 = -(r3 + 1): vshl by it is a right shift
        vdup.16         q8,  r6                 @ q8 = initial accumulator value
        mov             r6,  r0                 @ output pointer
        beq             10f                     @ flags from the eors above
        subs            lr,  lr,  #1
        beq             20f
        subs            lr,  lr,  #1
        beq             30f
        b               40f
10:     biweight_\w     vmlal.u8, vmlal.u8
20:     rsb             r4,  r4,  #0
        biweight_\w     vmlal.u8, vmlsl.u8
30:     rsb             r4,  r4,  #0
        rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlsl.u8
40:     rsb             r5,  r5,  #0
        biweight_\w     vmlsl.u8, vmlal.u8
        .endfunc
        .endm
1518
1519
        @ Defines the exported stub ff_biweight_h264_pixels_<w>x<h>_neon,
        @ which loads the row count h into ip and continues in
        @ biweight_h264_pixels_<w>_neon.
        @ With b=1 (default) the stub branches there. With b=0 the branch is
        @ omitted and execution falls through, so the stub must be emitted
        @ immediately before the matching biweight_func expansion.
        .macro  biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               biweight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm
1527
1528
        @ Instantiate the bi-weight functions. Within each width group the
        @ b=0 entry has to stay directly in front of biweight_func because
        @ it falls through into the common body.
        biweight_entry  16, 8
        biweight_entry  16, 16, b=0
        biweight_func   16

        biweight_entry  8,  16
        biweight_entry  8,  4
        biweight_entry  8,  8,  b=0
        biweight_func   8

        biweight_entry  4,  8
        biweight_entry  4,  2
        biweight_entry  4,  4,  b=0
        biweight_func   4
1541 bd53b426 Måns Rullgård
1542
@ Weighted prediction
1543
1544 fe7f149e Måns Rullgård
        @ Weights a 16 byte wide block in place, two rows per loop iteration.
        @ Body of weight_h264_pixels_16_neon.
        @
        @ Macro argument:
        @   add  instruction that combines q8 with the product, always as
        @        "add dst, q8, product" (vadd/vsub/vhadd/vhsub, chosen by
        @        weight_func)
        @ Expects (set up by weight_func):
        @   r0  input pointer, 128-bit aligned, row step r1
        @   r4  output pointer (same block as r0), row step r1
        @   r3  weight magnitude (low byte is used)
        @   q8  offset term, q9 = per-lane shift for the rounding vrshl
        @   ip  row count (multiple of 2)
        @ Ends by popping {r4, pc}, i.e. it returns from the enclosing function.
        .macro  weight_16 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d20-d21},[r0,:128], r1
        vmull.u8        q2,  d0,  d20
        pld             [r0]
        vmull.u8        q3,  d0,  d21
        vld1.8          {d28-d29},[r0,:128], r1
        vmull.u8        q12, d0,  d28
        pld             [r0]
        vmull.u8        q13, d0,  d29
        \add            q2,  q8,  q2
        vrshl.s16       q2,  q2,  q9
        \add            q3,  q8,  q3
        vrshl.s16       q3,  q3,  q9
        vqmovun.s16     d4,  q2
        vqmovun.s16     d5,  q3
        \add            q12, q8,  q12
        vrshl.s16       q12, q12, q9
        \add            q13, q8,  q13
        vrshl.s16       q13, q13, q9
        vqmovun.s16     d24, q12
        vqmovun.s16     d25, q13
        vst1.8          {d4- d5}, [r4,:128], r1
        vst1.8          {d24-d25},[r4,:128], r1
        bne             1b                      @ flags still come from the subs above
        pop             {r4, pc}
        .endm
1572
1573 fe7f149e Måns Rullgård
        @ Weights an 8 byte wide block in place, two rows per loop iteration.
        @ Body of weight_h264_pixels_8_neon.
        @
        @ Macro argument:
        @   add  instruction that combines q8 with the product, always as
        @        "add dst, q8, product" (chosen by weight_func)
        @ Expects (set up by weight_func):
        @   r0  input pointer, 64-bit aligned, row step r1
        @   r4  output pointer (same block as r0), row step r1
        @   r3  weight magnitude (low byte is used)
        @   q8  offset term, q9 = per-lane shift for the rounding vrshl
        @   ip  row count (multiple of 2)
        @ Ends by popping {r4, pc}, i.e. it returns from the enclosing function.
        .macro  weight_8 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #2
        vld1.8          {d4},[r0,:64], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        vld1.8          {d6},[r0,:64], r1
        vmull.u8        q10, d0,  d6
        \add            q1,  q8,  q1
        pld             [r0]
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.8          {d2},[r4,:64], r1
        vst1.8          {d4},[r4,:64], r1
        bne             1b                      @ flags still come from the subs above
        pop             {r4, pc}
        .endm
1593
1594 fe7f149e Måns Rullgård
        @ Weights a 4 byte wide block in place. Rows are packed two per d
        @ register, so one loop iteration handles four rows; a remainder of
        @ two rows is handled at label 2.
        @ Body of weight_h264_pixels_4_neon.
        @
        @ Macro argument:
        @   add  instruction that combines q8 with the product, always as
        @        "add dst, q8, product" (chosen by weight_func)
        @ Expects (set up by weight_func):
        @   r0  input pointer, 32-bit aligned, row step r1
        @   r4  output pointer (same block as r0), row step r1
        @   r3  weight magnitude (low byte is used)
        @   q8  offset term, q9 = per-lane shift for the rounding vrshl
        @   ip  row count
        @ Ends by popping {r4, pc}, i.e. it returns from the enclosing function.
        @
        @ q1 and q10 are not pre-loaded from q8: vmull.u8 writes the whole
        @ destination, so any earlier value would be overwritten unread.
        .macro  weight_4 add
        vdup.8          d0,  r3
1:      subs            ip,  ip,  #4
        vld1.32         {d4[0]},[r0,:32], r1
        vld1.32         {d4[1]},[r0,:32], r1
        vmull.u8        q1,  d0,  d4
        pld             [r0]
        blt             2f                      @ fewer than 4 rows were left: finish the 2 just loaded
        vld1.32         {d6[0]},[r0,:32], r1
        vld1.32         {d6[1]},[r0,:32], r1
        vmull.u8        q10, d0,  d6
        pld             [r0]
        \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        \add            q10, q8,  q10
        vrshl.s16       q10, q10, q9
        vqmovun.s16     d4,  q10
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        vst1.32         {d4[0]},[r4,:32], r1
        vst1.32         {d4[1]},[r4,:32], r1
        bne             1b                      @ flags still come from the subs above
        pop             {r4, pc}
2:      \add            q1,  q8,  q1
        vrshl.s16       q1,  q1,  q9
        vqmovun.s16     d2,  q1
        vst1.32         {d2[0]},[r4,:32], r1
        vst1.32         {d2[1]},[r4,:32], r1
        pop             {r4, pc}
        .endm
1629
1630
        @ Defines weight_h264_pixels_<w>_neon, the common body behind the
        @ ff_weight_h264_pixels_<w>x<h>_neon entry points.
        @
        @ In:   r0  block pointer, read and written in place (copied to r4)
        @       r1  row step
        @       r2  shift amount (presumably log2_denom - confirm)
        @       r3  signed weight
        @       [sp]  one stack argument, the offset
        @       ip  row count, set by the entry stub
        @
        @ q8 is set to offset << r2 in every 16-bit lane. Two cases:
        @   r2 > 1:  a halving add/sub (vhadd/vhsub) already divides by two,
        @            so the rounding shift that follows is by r2 - 1
        @   r2 <= 1: plain vadd/vsub followed by a rounding shift by r2
        @ The multiply is an unsigned byte multiply, so for a negative weight
        @ r3 is negated and the product is subtracted from q8 instead.
        @ Each weight_<w> expansion ends with its own pop {r4, pc}, so the
        @ variants do not fall into each other. The two "10:" labels are
        @ numeric local labels; each "blt 10f" binds to the next one.
        .macro  weight_func w
function weight_h264_pixels_\w\()_neon
        push            {r4, lr}
        ldr             r4,  [sp, #8]           @ first stack argument, above the 2 pushed words
        cmp             r2,  #1                 @ flags are consumed by the ble below
        lsl             r4,  r4,  r2
        vdup.16         q8,  r4
        mov             r4,  r0                 @ output pointer
        ble             20f
        rsb             lr,  r2,  #1            @ shift = 1 - r2 (negative: right shift by r2 - 1)
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vhadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vhsub.s16
20:     rsb             lr,  r2,  #0            @ shift = -r2
        vdup.16         q9,  lr
        cmp             r3,  #0
        blt             10f
        weight_\w       vadd.s16
10:     rsb             r3,  r3,  #0
        weight_\w       vsub.s16
        .endfunc
        .endm
1655
1656
        @ Defines the exported stub ff_weight_h264_pixels_<w>x<h>_neon, which
        @ loads the row count h into ip and continues in
        @ weight_h264_pixels_<w>_neon.
        @ With b=1 (default) the stub branches there. With b=0 the branch is
        @ omitted and execution falls through, so the stub must be emitted
        @ immediately before the matching weight_func expansion.
        .macro  weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
        mov             ip,  #\h
.if \b
        b               weight_h264_pixels_\w\()_neon
.endif
        .endfunc
        .endm
1664
1665
        @ Instantiate the weight functions. Within each width group the b=0
        @ entry has to stay directly in front of weight_func because it
        @ falls through into the common body.
        weight_entry    16, 8
        weight_entry    16, 16, b=0
        weight_func     16

        weight_entry    8,  16
        weight_entry    8,  4
        weight_entry    8,  8,  b=0
        weight_func     8

        weight_entry    4,  8
        weight_entry    4,  2
        weight_entry    4,  4,  b=0
        weight_func     4