/**
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"
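
@ ff_vp8_luma_dc_wht_neon: inverse Walsh-Hadamard transform of the
@ 16 luma DC values.  r0 points to the coefficient blocks (the DC of
@ each 4x4 block sits 32 bytes = 16 coefficients apart), r1 to the
@ 4x4 DC input, which is cleared on the way.  The single vadd of #3
@ to d0 before the second butterfly pass biases all 16 outputs, so
@ the plain vshr.s16 #3 matches the reference (x + 3) >> 3 rounding.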

function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        vmov.i16        q15, #0

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8, #3

        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        vadd.i16        d0,  d0,  d16

        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3
        vshr.s16        q1,  q1,  #3

        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc
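
@ DC-only case: all 16 WHT outputs collapse to (dc[0] + 3) >> 3.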

function ff_vp8_luma_dc_wht_dc_neon, export=1
        ldrsh           r2,  [r1]
        mov             r3,  #0
        add             r2,  r2,  #3
        strh            r3,  [r1]
        asr             r2,  r2,  #3
    .rept 16
        strh            r2,  [r0], #32
    .endr
        bx              lr
endfunc
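
@ The IDCT multipliers below come from the VP8 spec:
@   20091/65536 = sqrt(2)*cos(pi/8) - 1
@   35468/65536 = sqrt(2)*sin(pi/8)
@ The second constant is halved because vqdmulh doubles its product,
@ and the "-1" in the first is undone by adding the unscaled input
@ back after the vshrn #16 narrowing.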

function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        movw            r3,  #20091
        movt            r3,  #35468/2
        vdup.32         d4,  r3

        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc
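
@ DC-only IDCT: adds (dc + 4) >> 3 (vrshr.s16 #3 rounds) to a 4x4
@ block, with the vqmovun narrowing providing the 0..255 clamp.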

function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]
        strh            r3,  [r1]
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2, lsl #2
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc
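
@ The add4uv/add4y variants below handle four DC-only blocks per
@ call; their DC coefficients are again 32 bytes apart in r1.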

function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc

function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc
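
@ Loop-filter arithmetic, done on pixels biased into signed range by
@ ^0x80 (per the VP8 reference):
@   w  = clamp(3*(QS0-PS0) + clamp(PS1-QS1)),  then w &= normal_limit
@ (for the inner filter the clamp(PS1-QS1) term is kept only where
@ hev is set; for the mbedge filter w is later split into a hev part
@ fed to filter_common and a ~hev part fed to filter_mbedge).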

@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit
        @   q9: hev

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm
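
@ transpose8x16matrix transposes the two 8x8 halves of a 16x8 pixel
@ block held in q0-q7 with the standard vtrn.32/vtrn.16/vtrn.8
@ ladder, turning loaded rows into the P3..Q3 columns the filter
@ expects (and back again before storing).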

.macro transpose8x16matrix
        vtrn.32         q0,   q4
        vtrn.32         q1,   q5
        vtrn.32         q2,   q6
        vtrn.32         q3,   q7

        vtrn.16         q0,   q2
        vtrn.16         q1,   q3
        vtrn.16         q4,   q6
        vtrn.16         q5,   q7

        vtrn.8          q0,   q1
        vtrn.8          q2,   q3
        vtrn.8          q4,   q5
        vtrn.8          q6,   q7
.endm
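
@ The luma entry points take (dst, stride, flim_E, flim_I,
@ hev_thresh), the chroma ones (u, v, stride, flim_E, flim_I,
@ hev_thresh); because of the vpush {q4-q7} prologue the stack
@ arguments live at [sp, #64] and [sp, #68] inside the functions.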

.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r1,  lsl #1+!\simple

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1

.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose8x16matrix

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1, lsl #4    @ backup 16 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8          {d0},     [r0],     r1
        vst1.8          {d2},     [r0],     r1
        vst1.8          {d4},     [r0],     r1
        vst1.8          {d6},     [r0],     r1
        vst1.8          {d8},     [r0],     r1
        vst1.8          {d10},    [r0],     r1
        vst1.8          {d12},    [r0],     r1
        vst1.8          {d14},    [r0],     r1
        vst1.8          {d1},     [r0],     r1
        vst1.8          {d3},     [r0],     r1
        vst1.8          {d5},     [r0],     r1
        vst1.8          {d7},     [r0],     r1
        vst1.8          {d9},     [r0],     r1
        vst1.8          {d11},    [r0],     r1
        vst1.8          {d13},    [r0],     r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose8x16matrix

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2, lsl #3    @ backup u 8 rows
        sub             r1,  r1,  r2, lsl #3    @ backup v 8 rows

        transpose8x16matrix

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
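
@ Plain-copy "put" functions: src (r2, unaligned) is copied to dst
@ (r0, aligned) four rows per iteration, so h is assumed to be a
@ multiple of 4.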

function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc

function ff_put_vp8_pixels4_neon, export=1
        ldr             r12, [sp, #0]           @ h
        push            {r4-r6,lr}
1:
        subs            r12, r12, #4
        ldr             r4,       [r2], r3
        ldr             r5,       [r2], r3
        ldr             r6,       [r2], r3
        ldr             lr,       [r2], r3
        str             r4,       [r0], r1
        str             r5,       [r0], r1
        str             r6,       [r0], r1
        str             lr,       [r0], r1
        bgt             1b
        pop             {r4-r6,pc}
endfunc

/* 4/6-tap 8th-pel MC */
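
@ The subpel filter taps are loaded from subpel_filters (presumably
@ defined with the file's other constants; not visible in this
@ excerpt): one 16-byte row of six 16-bit taps per fractional
@ position.  mx/my run from 1 to 7 (0 means full-pel copy), hence
@ the subpel_filters-16 base with "lsl #4" indexing below.  The
@ table holds tap magnitudes; the two negative taps are applied
@ with vmls, and vqrshrun.s16 #7 does the final (sum + 64) >> 7
@ with a clamp to 0..255.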

.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm

.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm

.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm

.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
.endm
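
@ The _y2 macros above compute two output rows per invocation,
@ sharing the source rows that the two overlapping filter windows
@ have in common.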

function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}
        vpush           {d8-d15}

        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2-d4},  [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
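
@ Two-pass filtering for the diagonal (hNvM) cases: pass one filters
@ h+5 (6-tap) or h+3 (4-tap) rows horizontally into a 16-byte
@ aligned scratch buffer on the stack (e.g. 336 = 21 rows * 16
@ bytes for the 16-wide case), pass two filters that buffer
@ vertically into dst.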

function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #336+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48

        vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},  [r2], r3
        vld1.8          {d3},  [r2], r3
        vld1.8          {d4},  [r2], r3
        vld1.8          {d5},  [r2], r3
        vld1.8          {d6},  [r2], r3
        vld1.8          {d7},  [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]
        sub             lr,  lr,  #16

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d7},  [lr,:128]!
        vld1.8          {d30},    [lr,:64]
        sub             lr,  lr,  #32

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
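
@ The 4-pixel-wide variants pack two rows into the two 32-bit lanes
@ of each d register, so one pass through the 8-wide filter macros
@ yields four output rows (hence the "subs r12, r12, #4").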

function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
1405
1406
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #52+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.8          {d6},     [lr,:64]!
        vld1.32         {d28[]},  [lr,:32]
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!
        vld1.8          {d7},     [lr,:64]!
        vld1.32         {d28[1]}, [lr,:32]
        sub             lr,  lr,  #16
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
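
@ h6v4: the 4-tap vertical pass only needs 3 extra source rows, so h+3
@ rows are buffered and the scratch area shrinks to (8+3)*4 = 44 bytes.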
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #2
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
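
@ v4: source rows n..n+4 are loaded into lane 0 and rows n+2..n+6 into
@ lane 1 of d2-d6, so a single vp8_epel8_v4_y2 call produces 4 output rows.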
function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]
1:
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3
        sub             r2,  r2,  #1
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3
        bic             lr,  lr,  #15
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15
2:
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc

@ note: each 6-tap filter row below sums to 128 once the negative taps are
@ applied, so the worst case sum of all filter values * 255 is
@ 128 * 255 = 0x7f80, and 16 bit arithmetic can be used to apply the filters
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst
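
/* For reference, a scalar sketch of the 6-tap filter applied by the
 * vp8_epel8_h6/v6 macros above.  The rows of subpel_filters store tap
 * magnitudes; taps 1 and 4 are subtracted, so each row sums to 128:
 *
 *     dst[x] = clip_uint8((F[0]*src[x-2] - F[1]*src[x-1] + F[2]*src[x] +
 *                          F[3]*src[x+1] - F[4]*src[x+2] + F[5]*src[x+3] +
 *                          64) >> 7);
 */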

/* Bilinear MC */
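
@ For a filter fraction f (mx or my, 0..7) and neighbouring samples a and b
@ (src[x] and src[x+1], or the sample one row below for the v variants),
@ every function below computes
@     dst[x] = (a * (8 - f) + b * f + 4) >> 3
@ with the +4 and >>3 performed by vrshrn.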
function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2-d4},  [r2], r1
        vext.8          q2,  q1,  q2,  #1
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r1
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {q1},     [r2], r1
1:
        subs            r12, r12, #2
        vld1.8          {q2},     [r2], r1
        vmull.u8        q3,  d2,  d1
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
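
@ hv: the horizontally filtered previous row is carried in q2 across loop
@ iterations, so each source row is filtered horizontally only once before
@ being blended vertically with the row above; both stages round with
@ vrshrn #3.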
function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4-d6},  [r2], r1
        vext.8          q3,  q2,  q3,  #1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d18-d20},[r2], r1
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r1
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {q1},     [r2], r1
        vext.8          d3,  d2,  d3,  #1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.8          {d2},     [r2], r1
1:
        subs            r12, r12, #2
        vld1.8          {d3},     [r2], r1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},     [r2], r1
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {q3},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3
        vmull.u8        q12, d16, d3
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2
        vld1.8          {d2},     [r2], r1
        vext.8          d3,  d2,  d3,  #1
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        bgt             1b

        bx              lr
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},   [r2], r1
1:
        vld1.32         {d3[]},   [r2]
        vld1.32         {d2[1]},  [r2], r1
        vld1.32         {d3[1]},  [r2], r1
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2
        vrshrn.u16      d4,  q2,  #3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc
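
@ 4-pixel hv: two horizontally filtered rows are packed into one d
@ register (vtrn/vrev64) so the vertical blend produces two output rows
@ per iteration, with the newest filtered row carried over in d22.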
function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2
        vld1.8          {d6},     [r2], r1
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc