Statistics
| Branch: | Revision:

ffmpeg / libavcodec / arm / vp8dsp_neon.S @ 8b454c35

History | View | Annotate | Download (66.4 KB)

1
/**
2
 * VP8 NEON optimisations
3
 *
4
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
5
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
6
 *
7
 * This file is part of FFmpeg.
8
 *
9
 * FFmpeg is free software; you can redistribute it and/or
10
 * modify it under the terms of the GNU Lesser General Public
11
 * License as published by the Free Software Foundation; either
12
 * version 2.1 of the License, or (at your option) any later version.
13
 *
14
 * FFmpeg is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17
 * Lesser General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU Lesser General Public
20
 * License along with FFmpeg; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22
 */
23

    
24
#include "asm.S"
25

    
26
@ void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16])
@ Inverse 4x4 Walsh-Hadamard transform of the luma DC coefficients.
@   r0 = block: one result is scattered to the first coefficient of each
@        of the 16 4x4 sub-blocks (32 bytes apart)
@   r1 = dc: 16 input coefficients; zeroed after being read
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]     @ d0..d3 = input rows 0..3
        vmov.i16        q15, #0

        @ first pass (butterflies on rows)
        vadd.i16        d4,  d0,  d3            @ t0 = row0 + row3
        vadd.i16        d6,  d1,  d2            @ t1 = row1 + row2
        vst1.16         {q15},    [r1,:128]!    @ clear first half of dc[]
        vsub.i16        d7,  d1,  d2            @ t3 = row1 - row2
        vsub.i16        d5,  d0,  d3            @ t2 = row0 - row3
        vst1.16         {q15},    [r1,:128]     @ clear second half of dc[]
        vadd.i16        q0,  q2,  q3            @ rows 0,1 = t0+t1, t2+t3
        vsub.i16        q1,  q2,  q3            @ rows 2,3 = t0-t1, t2-t3

        vmov.i16        q8, #3                  @ rounding bias for the final >>3

        @ transpose so the second pass operates on columns
        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        @ biasing d0 here propagates +3 through the butterflies into
        @ every output lane, so one add rounds all 16 results
        vadd.i16        d0,  d0,  d16

        @ second pass (same butterfly structure)
        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3            @ (x + 3) >> 3
        vshr.s16        q1,  q1,  #3

        @ scatter results: one DC per 4x4 sub-block, 32 bytes apart
        mov             r3,  #32
        vst1.16         {d0[0]},  [r0,:16], r3
        vst1.16         {d1[0]},  [r0,:16], r3
        vst1.16         {d2[0]},  [r0,:16], r3
        vst1.16         {d3[0]},  [r0,:16], r3
        vst1.16         {d0[1]},  [r0,:16], r3
        vst1.16         {d1[1]},  [r0,:16], r3
        vst1.16         {d2[1]},  [r0,:16], r3
        vst1.16         {d3[1]},  [r0,:16], r3
        vst1.16         {d0[2]},  [r0,:16], r3
        vst1.16         {d1[2]},  [r0,:16], r3
        vst1.16         {d2[2]},  [r0,:16], r3
        vst1.16         {d3[2]},  [r0,:16], r3
        vst1.16         {d0[3]},  [r0,:16], r3
        vst1.16         {d1[3]},  [r0,:16], r3
        vst1.16         {d2[3]},  [r0,:16], r3
        vst1.16         {d3[3]},  [r0,:16], r3

        bx              lr
endfunc
78

    
79
@ DC-only variant of the luma DC WHT: when only dc[0] is non-zero the
@ whole transform collapses to (dc[0] + 3) >> 3 replicated 16 times.
@   r0 = block (one value per 4x4 sub-block, 32 bytes apart)
@   r1 = dc; dc[0] is read and then cleared
function ff_vp8_luma_dc_wht_dc_neon, export=1
        ldrsh           r2,  [r1]               @ r2 = dc[0] (sign-extended)
        mov             r3,  #0
        add             r2,  r2,  #3            @ round
        strh            r3,  [r1]               @ clear the consumed coefficient
        asr             r2,  r2,  #3            @ r2 = (dc[0] + 3) >> 3
    .rept 16
        strh            r2,  [r0], #32          @ store to each sub-block's DC slot
    .endr
        bx              lr
endfunc
90

    
91
@ void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
@ 4x4 inverse DCT of one coefficient block, result added to dst with
@ unsigned saturation. The coefficient block is zeroed on return.
@   r0 = dst, r1 = block, r2 = stride
@ The two Q16 multipliers come from the VP8 spec:
@   20091/65536 = sqrt(2)*cos(pi/8) - 1,  35468/65536 = sqrt(2)*sin(pi/8)
@ (the second is stored halved because vqdmulh doubles the product).
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]     @ d0..d3 = coefficient rows
        movw            r3,  #20091
        movt            r3,  #35468/2
        vdup.32         d4,  r3                 @ d4[0]=20091, d4[1]=35468/2

        @ first (column) pass on odd rows d1/d3:
        @ MUL(x,20091): widen-multiply, >>16, add x back => x*(1+20091/65536)
        @ MUL(x,35468): vqdmulh with the halved constant => x*35468/65536
        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2            @ t0 = row0 + row2
        vsub.s16        d17, d0,  d2            @ t1 = row0 - row2
        vadd.s16        d18, d21, d23           @ t2
        vsub.s16        d19, d20, d22           @ t3
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        @ transpose for the second pass (note d3/d2 swapped on purpose;
        @ the same swapped pattern is used again after the second pass)
        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ second pass, with the coefficient-block clears interleaved
        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!    @ clear block (first half)
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]     @ clear block (second half)
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        @ butterflies interleaved with loading the 4 dst rows (q10/q11);
        @ the load order d20,d22,d23,d21 + vtrn.32 lines the pixels up
        @ with the transposed result rows
        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2
        vrshr.s16       q0,  q0,  #3            @ rounded final >>3
        vtrn.32         q10, q11
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2   @ rewind dst to the first row

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ add residual to pixels, saturate back to u8
        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        @ store order matches the row shuffling done during the loads
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc
163

    
164
@ void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
@ DC-only inverse transform: adds round(block[0] / 8) to all 16 pixels of a
@ 4x4 dst area (saturating). block[0] is cleared.
@   r0 = dst, r1 = block, r2 = stride
function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]               @ dc = block[0]
        strh            r3,  [r1]               @ clear consumed coefficient
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3            @ dc = (dc + 4) >> 3, rounded
        @ load the 4 dst rows (two rows per d register)
        vld1.32         {d0[]},   [r0,:32], r2
        vld1.32         {d1[]},   [r0,:32], r2
        vld1.32         {d0[1]},  [r0,:32], r2
        vld1.32         {d1[1]},  [r0,:32], r2
        vaddw.u8        q2,  q1,  d0            @ rows 0,2 + dc
        vaddw.u8        q3,  q1,  d1            @ rows 1,3 + dc
        sub             r0,  r0,  r2, lsl #2    @ rewind dst
        vqmovun.s16     d0,  q2                 @ saturate to u8
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc
185

    
186
@ DC-only inverse transform for 4 chroma blocks arranged 2x2 over an
@ 8x8 pixel area. The 4 DCs live 32 bytes apart in the coefficient
@ buffer and are cleared after reading.
@   r0 = dst, r1 = block, r2 = stride
@ q8 = {dc0 x4, dc1 x4} covers the top 8-wide rows (left/right block),
@ q9 = {dc2 x4, dc3 x4} covers the bottom 8-wide rows.
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        @ gather the 4 DCs into d16..d19, zeroing each slot behind us
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        mov             r3,  r0                 @ r3 = store pointer (loads keep using r0)
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        @ loads, widening adds and stores are interleaved to hide latency
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc
235

    
236
@ DC-only inverse transform for 4 luma blocks side by side in one
@ 16-pixel-wide row of 4x4 blocks. DCs are 32 bytes apart in the
@ coefficient buffer and cleared after reading.
@   r0 = dst, r1 = block, r2 = stride
@ q8 = {dc0 x4, dc1 x4} -> left 8 pixels, q9 = {dc2 x4, dc3 x4} -> right 8.
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        @ gather 4 DCs into d16..d19, zeroing each slot behind us
        vld1.16         {d16[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d17[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d18[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vld1.16         {d19[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        @ add dc to each 16-wide row; q8 covers d0/d2/d4/d6 (left half),
        @ q9 covers d1/d3/d5/d7 (right half)
        vaddw.u8        q10, q8,  d0
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2   @ rewind dst to the first row
        vqmovun.s16     d20, q10                @ saturate back to u8
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc
277

    
278
@ Core VP8 loop-filter kernel, shared by all the v/h 16-pixel and 8uv
@ wrappers below. Filters one edge (16 pixel positions wide, one position
@ per byte lane across q registers).
@   inner=1  -> inner-block edge filter (4-tap, conditionally adjusts P1/Q1)
@   simple=1 -> "simple" filter (edge threshold only, touches P0/Q0 only)
@   neither  -> macroblock-edge filter (6-tap, adjusts P2..Q2)
@
@ Register layout:
@   P3..Q3 -> q0..q7
@   flim_E -> q14
@   flim_I -> q15
@   hev_thresh -> r12
@
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        @ calculate hev and normal_limit:
        vabd.u8         q12, q2,  q3            @ abs(P1-P0)
        vabd.u8         q13, q5,  q4            @ abs(Q1-Q0)
        vabd.u8         q10, q0,  q1            @ abs(P3-P2)
        vabd.u8         q11, q1,  q2            @ abs(P2-P1)
        vcle.u8         q8,  q12, q15           @ abs(P1-P0) <= flim_I
        vcle.u8         q9,  q13, q15           @ abs(Q1-Q0) <= flim_I
        vcle.u8         q10, q10, q15           @ abs(P3-P2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(P2-P1) <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ abs(Q3-Q2)
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ abs(Q2-Q1)
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ abs(Q3-Q2) <= flim_I
        vcle.u8         q11, q11, q15           @ abs(Q2-Q1) <= flim_I
        vabd.u8         q9,  q3,  q4            @ abs(P0-Q0)
        vabd.u8         q15, q2,  q5            @ abs(P1-Q1)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ abs(P0-Q0) * 2
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ abs(P1-Q1) / 2
        vdup.8          q15, r12                @ hev_thresh
        vqadd.u8        q11, q9,  q10           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        vcgt.u8         q12, q12, q15           @ abs(P1-P0) > hev_thresh
        vcle.u8         q11, q11, q14           @ (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        vcgt.u8         q14, q13, q15           @ abs(Q1-Q0) > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ at this point:
        @   q8: normal_limit  (per-lane mask: all filter conditions met)
        @   q9: hev           (per-lane mask: high edge variance)

        @ convert to signed value:
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0
        vsubl.s8        q11, d9,  d7            @   (widened to 16bit)
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1-QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ if(hev) w += clamp(PS1-QS1)
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1-QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ narrow result back into q10
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= normal_limit

        @ registers used at this point..
        @   q0 -> P3  (don't corrupt)
        @   q1-q6 -> PS2-QS2
        @   q7 -> Q3  (don't corrupt)
        @   q9 -> hev
        @   q10 -> w
        @   q13 -> #0x80
        @   q14 -> #4
        @   q15 -> #3
        @   q8, q11, q12 -> unused

        @ filter_common:   is4tap==1
        @   c1 = clamp(w + 4) >> 3;
        @   c2 = clamp(w + 3) >> 3;
        @   Q0 = s2u(QS0 - c1);
        @   P0 = s2u(PS0 + c2);

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .elseif \inner
        @ the !is4tap case of filter_common, only used for inner blocks
        @   c3 = ((c1&~hev) + 1) >> 1;
        @   Q1 = s2u(QS1 - c3);
        @   P1 = s2u(PS1 + c3);
        vqadd.s8        q11, q10, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 >>= 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w&hev)+4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w&hev)+3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0-c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0+c2)

        @ filter_mbedge:
        @   a = clamp((27*w + 63) >> 7);
        @   Q0 = s2u(QS0 - a);
        @   P0 = s2u(PS0 + a);
        @   a = clamp((18*w + 63) >> 7);
        @   Q1 = s2u(QS1 - a);
        @   P1 = s2u(PS1 + a);
        @   a = clamp((9*w + 63) >> 7);
        @   Q2 = s2u(QS2 - a);
        @   P2 = s2u(PS2 + a);
        @ 9*w is built as w*8 + w, then 18*w and 27*w by repeated adds
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63)>>7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63)>>7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63)>>7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2+a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2-a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1+a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1-a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0+a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0-a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm
456

    
457
@ In-place transpose of the 16x8 byte matrix held in q0-q7:
@ 16 rows of 8 bytes (loaded into d0..d15 by the h loop filters) become
@ 8 vectors of 16 pixels, so each q register then holds one filter
@ column (P3..Q3) across all 16 rows. The macro is its own inverse.
.macro transpose8x16matrix
        vtrn.32         q0,   q4
        vtrn.32         q1,   q5
        vtrn.32         q2,   q6
        vtrn.32         q3,   q7

        vtrn.16         q0,   q2
        vtrn.16         q1,   q3
        vtrn.16         q4,   q6
        vtrn.16         q5,   q7

        vtrn.8          q0,   q1
        vtrn.8          q2,   q3
        vtrn.8          q4,   q5
        vtrn.8          q6,   q7
.endm
473

    
474
@ Vertical (horizontal-edge) loop filter over a 16-pixel-wide edge.
@ Generates ff_vp8_v_loop_filter16{,_inner,_simple}_neon:
@   r0 = dst (points at the row just below the edge), r1 = stride,
@   r2 = flim_E, r3 = flim_I (unused by _simple),
@   [sp+64] = hev_thresh (after the 64-byte vpush; unused by _simple)
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        @ back up 4 rows (full/inner) or 2 rows (simple) to P3/P1
        sub             r0,  r0,  r1,  lsl #1+!\simple

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2:  dst -= stride * 6
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
522

    
523
@ Vertical (horizontal-edge) loop filter over the two 8-pixel-wide chroma
@ planes, processed together (u in the low d registers, v in the high).
@ Generates ff_vp8_v_loop_filter8uv{,_inner}_neon:
@   r0 = u dst, r1 = v dst, r2 = stride, r3 = flim_E,
@   [sp+64] = flim_I, [sp+68] = hev_thresh (offsets after the 64-byte vpush)
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2   @ back up 4 rows to P3 (u)
        sub             r1,  r1,  r2,  lsl #2   @ back up 4 rows to P3 (v)
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
581

    
582
@ Horizontal (vertical-edge) loop filter over 16 rows. Loads a 16x8 block
@ around the edge, transposes it so the filter columns land in q0-q7,
@ filters, transposes back and stores.
@ Generates ff_vp8_h_loop_filter16{,_inner,_simple}_neon:
@   r0 = dst (points at the column just right of the edge), r1 = stride,
@   r2 = flim_E, r3 = flim_I (unused by _simple),
@   [sp+64] = hev_thresh (after the 64-byte vpush; unused by _simple)
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ back up 4 columns to P3
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh
    .endif

        @ Load pixels:
        vld1.8          {d0},     [r0], r1      @ load first 8-line src data
        vld1.8          {d2},     [r0], r1
        vld1.8          {d4},     [r0], r1
        vld1.8          {d6},     [r0], r1
        vld1.8          {d8},     [r0], r1
        vld1.8          {d10},    [r0], r1
        vld1.8          {d12},    [r0], r1
        vld1.8          {d14},    [r0], r1
        vld1.8          {d1},     [r0], r1      @ load second 8-line src data
        vld1.8          {d3},     [r0], r1
        vld1.8          {d5},     [r0], r1
        vld1.8          {d7},     [r0], r1
        vld1.8          {d9},     [r0], r1
        vld1.8          {d11},    [r0], r1
        vld1.8          {d13},    [r0], r1
        vld1.8          {d15},    [r0], r1

        transpose8x16matrix                     @ rows -> filter columns P3..Q3

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1, lsl #4    @ backup 16 rows

        transpose8x16matrix                     @ filter columns -> rows

        @ Store pixels:
        vst1.8          {d0},     [r0],     r1
        vst1.8          {d2},     [r0],     r1
        vst1.8          {d4},     [r0],     r1
        vst1.8          {d6},     [r0],     r1
        vst1.8          {d8},     [r0],     r1
        vst1.8          {d10},    [r0],     r1
        vst1.8          {d12},    [r0],     r1
        vst1.8          {d14},    [r0],     r1
        vst1.8          {d1},     [r0],     r1
        vst1.8          {d3},     [r0],     r1
        vst1.8          {d5},     [r0],     r1
        vst1.8          {d7},     [r0],     r1
        vst1.8          {d9},     [r0],     r1
        vst1.8          {d11},    [r0],     r1
        vst1.8          {d13},    [r0],     r1
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1
647

    
648
@ Horizontal (vertical-edge) loop filter over the two chroma planes,
@ 8 rows each, processed together: u rows land in even d registers,
@ v rows in odd ones, so after the transpose each q register holds one
@ filter column for all 16 rows (8 u + 8 v).
@ Generates ff_vp8_h_loop_filter8uv{,_inner}_neon:
@   r0 = u dst, r1 = v dst, r2 = stride, r3 = flim_E,
@   [sp+64] = flim_I, [sp+68] = hev_thresh (offsets after the 64-byte vpush)
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ back up 4 columns to P3 (u)
        sub             r1,  r1,  #4            @ back up 4 columns to P3 (v)
        ldr             r12, [sp, #64]          @ flim_I

        @ Load pixels:
        vld1.8          {d0},     [r0], r2      @ load u
        vld1.8          {d1},     [r1], r2      @ load v
        vld1.8          {d2},     [r0], r2
        vld1.8          {d3},     [r1], r2
        vld1.8          {d4},     [r0], r2
        vld1.8          {d5},     [r1], r2
        vld1.8          {d6},     [r0], r2
        vld1.8          {d7},     [r1], r2
        vld1.8          {d8},     [r0], r2
        vld1.8          {d9},     [r1], r2
        vld1.8          {d10},    [r0], r2
        vld1.8          {d11},    [r1], r2
        vld1.8          {d12},    [r0], r2
        vld1.8          {d13},    [r1], r2
        vld1.8          {d14},    [r0], r2
        vld1.8          {d15},    [r1], r2

        transpose8x16matrix                     @ rows -> filter columns P3..Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2, lsl #3    @ backup u 8 rows
        sub             r1,  r1,  r2, lsl #3    @ backup v 8 rows

        transpose8x16matrix                     @ filter columns -> rows

        @ Store pixels:
        vst1.8          {d0},     [r0], r2
        vst1.8          {d1},     [r1], r2
        vst1.8          {d2},     [r0], r2
        vst1.8          {d3},     [r1], r2
        vst1.8          {d4},     [r0], r2
        vst1.8          {d5},     [r1], r2
        vst1.8          {d6},     [r0], r2
        vst1.8          {d7},     [r1], r2
        vst1.8          {d8},     [r0], r2
        vst1.8          {d9},     [r1], r2
        vst1.8          {d10},    [r0], r2
        vst1.8          {d11},    [r1], r2
        vst1.8          {d12},    [r0], r2
        vst1.8          {d13},    [r1], r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
711

    
712
@ Plain 16-wide block copy (fullpel motion compensation).
@   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, [sp] = h
@ Processes 4 rows per iteration, so h is assumed to be a multiple of 4.
@ dst must be 16-byte aligned (:128 hint); src may be unaligned.
function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {q0},     [r2], r3
        vld1.8          {q1},     [r2], r3
        vld1.8          {q2},     [r2], r3
        vld1.8          {q3},     [r2], r3
        vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q1},     [r0,:128], r1
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b
        bx              lr
endfunc
727

    
728
@ Plain 8-wide block copy (fullpel motion compensation).
@   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, [sp] = h
@ Processes 4 rows per iteration, so h is assumed to be a multiple of 4.
@ dst must be 8-byte aligned (:64 hint); src may be unaligned.
function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
        vld1.8          {d0},     [r2], r3
        vld1.8          {d1},     [r2], r3
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r0,:64], r1
        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        bgt             1b
        bx              lr
endfunc
743

    
744
@ void ff_put_vp8_pixels4_neon(uint8_t *dst,  int dststride,    @ r0, r1
@                              uint8_t *src,  int srcstride,    @ r2, r3
@                              int h, ...)                      @ [sp]
@ Plain 4xh block copy using core-register 32-bit loads/stores
@ (a 4-byte row is too narrow for NEON to pay off).
@ h is assumed a multiple of 4.
function ff_put_vp8_pixels4_neon, export=1
        ldr             r12, [sp, #0]           @ h
        push            {r4-r6,lr}
1:
        subs            r12, r12, #4
        ldr             r4,       [r2], r3
        ldr             r5,       [r2], r3
        ldr             r6,       [r2], r3
        ldr             lr,       [r2], r3
        str             r4,       [r0], r1
        str             r5,       [r0], r1
        str             r6,       [r0], r1
        str             lr,       [r0], r1
        bgt             1b
        pop             {r4-r6,pc}
endfunc
760

    
761
/* 4/6-tap 8th-pel MC */
762

    
763
@ 6-tap horizontal filter producing 8 output pixels.
@   \d      destination d-register (8x u8 result)
@   \a, \b  two adjacent 8-byte source registers; taps are taken from
@           byte offsets 0..5 across the \a:\b pair via vext
@ Filter coefficients are preloaded in d0/d1 (one subpel_filters entry).
@ Positive taps accumulate with vmla, negative taps with vmls; the two
@ halves are combined with saturating add and rounded/narrowed by 7 bits.
@ Clobbers q8-q13, d27-d31.
.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1
        vmovl.u8        q8,  \a
        vext.8          d28, \a,  \b,  #2
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7            @ (sum + 64) >> 7, clamped to u8
.endm
784

    
785
@ 6-tap horizontal filter producing 16 output pixels (two 8-pixel halves
@ interleaved for latency hiding).
@   \d0, \d1    destination d-registers (low/high 8 results)
@   \s0, \s1    low halves of the two source q-registers (tap offset 0)
@   \s2         unused (kept for call-site symmetry)
@   \q0, \q1    two adjacent 16-byte source registers; vext across the
@               pair yields tap offsets 1..5
@ Coefficients preloaded in d0/d1.  Clobbers q1-q3, q8-q15.
.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
        vext.8          q14, \q0, \q1, #3
        vext.8          q15, \q0, \q1, #4
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0
        vmovl.u8        q1,  \s1
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7            @ round and narrow low half
        vqrshrun.s16    \d1, q14, #7            @ round and narrow high half
.endm
820

    
821
@ 6-tap vertical filter producing one row of 8 output pixels.
@   \d0         destination d-register
@   \s0..\s5    the six source rows (8x u8 each), top to bottom
@ Coefficients preloaded in d0/d1.  Clobbers q8-q13.
.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7            @ (sum + 64) >> 7, clamped to u8
.endm
837

    
838
@ 6-tap vertical filter producing TWO consecutive output rows at once
@ (the seven input rows \s0..\s6 provide both 6-row windows).
@   \d0         output for the window \s0..\s5
@   \d1         output for the window \s1..\s6
@ Coefficients preloaded in d0/d1.  Clobbers q8-q15.
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]
        vmul.u16        q15, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q14, q14, d1[1]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7            @ first row result
        vqrshrun.s16    \d1, q14, #7            @ second row result
.endm
863

    
864
@ 4-tap horizontal filter producing 8 output pixels.
@   \d      destination d-register
@   \a, \b  two adjacent 8-byte source registers; taps at byte
@           offsets 0..3 across the pair
@ Coefficients preloaded in d0/d1 (4-tap entries have zero outer taps,
@ so only d0[1..3]/d1[0] are used).  Clobbers q9-q12, d28-d30.
.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1
        vmovl.u8        q9,  \a
        vext.8          d29, \a,  \b,  #2
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7            @ (sum + 64) >> 7, clamped to u8
.endm
879

    
880
@ 4-tap vertical filter producing TWO consecutive output rows at once
@ (five input rows \s0..\s4 provide both 4-row windows).
@   \d0         output for the window \s0..\s3
@   \d1         output for the window \s1..\s4
@ Coefficients preloaded in d0/d1.  Clobbers q8-q15.
.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]
        vmul.u16        q14, q11, d0[3]
        vmul.u16        q11, q11, d0[2]
        vmul.u16        q15, q12, d0[3]
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7            @ first row result
        vqrshrun.s16    \d1, q11, #7            @ second row result
.endm
899

    
900
@ void ff_put_vp8_epel16_v6_neon(uint8_t *dst, int dststride,   @ r0, r1
@                                uint8_t *src, int srcstride,   @ r2, r3
@                                int h, int mx, int my)         @ stack
@ 16-wide, 6-tap vertical-only interpolation; two output rows per pass.
@ h is assumed a multiple of 2.
function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ back up 2 rows for top taps
        push            {r4,lr}
        vpush           {q4-q7}                 @ d8-d15 are callee-saved

        ldr             r4,  [sp, #80]          @ my (after push + vpush)
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[my-1]
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2   @ rewind: next pass reuses 5 rows

        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {q4-q7}
        pop             {r4,pc}
endfunc
931

    
932
@ void ff_put_vp8_epel16_h6_neon(uint8_t *dst, int dststride,   @ r0, r1
@                                uint8_t *src, int srcstride,   @ r2, r3
@                                int h, int mx, int my)         @ stack
@ 16-wide, 6-tap horizontal-only interpolation, one row per pass.
function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2            @ back up 2 columns for left taps
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4    @ &subpel_filters[mx-1]
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {d2-d4},  [r2], r3      @ 24 source bytes (16 + taps)

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
952

    
953
@ void ff_put_vp8_epel16_h6v6_neon(uint8_t *dst, int dststride, @ r0, r1
@                                  uint8_t *src, int srcstride, @ r2, r3
@                                  int h, int mx, int my)       @ stack
@ 16-wide, 6-tap horizontal + 6-tap vertical interpolation.
@ First pass filters h+5 rows horizontally into a 16-byte-aligned
@ scratch buffer on the stack (336 bytes = 21 rows of 16); the second
@ pass filters that buffer vertically into dst.
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ back up 2 rows
        sub             r2,  r2,  #2            @ and 2 columns
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal):
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #336+16       @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte-align scratch pointer
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3}, [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to scratch start
2:
        vld1.8          {d2-d5},  [lr,:128]!
        vld1.8          {d6-d9},  [lr,:128]!
        vld1.8          {d28-d31},[lr,:128]
        sub             lr,  lr,  #48           @ step one row (16) forward net

        vp8_epel8_v6    d2, d2, d4, d6, d8, d28, d30
        vp8_epel8_v6    d3, d3, d5, d7, d9, d29, d31

        vst1.8          {d2-d3}, [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc
1003

    
1004
@ void ff_put_vp8_epel8_v6_neon(uint8_t *dst, int dststride,    @ r0, r1
@                               uint8_t *src, int srcstride,    @ r2, r3
@                               int h, int mx, int my)          @ stack
@ 8-wide, 6-tap vertical-only interpolation; two output rows per pass.
@ h is assumed a multiple of 2.
function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ back up 2 rows for top taps
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {d2},  [r2], r3
        vld1.8          {d3},  [r2], r3
        vld1.8          {d4},  [r2], r3
        vld1.8          {d5},  [r2], r3
        vld1.8          {d6},  [r2], r3
        vld1.8          {d7},  [r2], r3
        vld1.8          {d28}, [r2]

        sub             r2,  r2,  r3,  lsl #2   @ rewind: reuse 5 rows next pass

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
1033

    
1034
@ void ff_put_vp8_epel8_h6_neon(uint8_t *dst, int dststride,    @ r0, r1
@                               uint8_t *src, int srcstride,    @ r2, r3
@                               int h, int mx, int my)          @ stack
@ 8-wide, 6-tap horizontal-only interpolation, one row per pass.
function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2            @ back up 2 columns for left taps
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {d2,d3}, [r2], r3       @ 16 source bytes (8 + taps)

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
1054

    
1055
@ void ff_put_vp8_epel8_h6v6_neon(uint8_t *dst, int dststride,  @ r0, r1
@                                 uint8_t *src, int srcstride,  @ r2, r3
@                                 int h, int mx, int my)        @ stack
@ 8-wide, 6-tap horizontal + 6-tap vertical interpolation.
@ First pass writes h+5 horizontally-filtered rows into a 16-byte-aligned
@ stack scratch buffer (168 bytes = 21 rows of 8); second pass filters it
@ vertically, two output rows per iteration.
function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ back up 2 rows
        sub             r2,  r2,  #2            @ and 2 columns
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16       @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte-align scratch pointer
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to scratch start
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6-d7},  [lr,:128]!    @ rows 4-5
        vld1.8          {d30},    [lr,:64]      @ row 6
        sub             lr,  lr,  #32           @ net advance: 2 rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
1103

    
1104
@ void ff_put_vp8_epel8_v4_neon(uint8_t *dst, int dststride,    @ r0, r1
@                               uint8_t *src, int srcstride,    @ r2, r3
@                               int h, int mx, int my)          @ stack
@ 8-wide, 4-tap vertical-only interpolation; two output rows per pass.
@ h is assumed a multiple of 2.
function ff_put_vp8_epel8_v4_neon, export=1
        sub             r2,  r2,  r3            @ back up 1 row for top tap
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2]
        sub             r2,  r2,  r3,  lsl #1   @ rewind: reuse 3 rows next pass

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
1130

    
1131
@ void ff_put_vp8_epel8_h4_neon(uint8_t *dst, int dststride,    @ r0, r1
@                               uint8_t *src, int srcstride,    @ r2, r3
@                               int h, int mx, int my)          @ stack
@ 8-wide, 4-tap horizontal-only interpolation, one row per pass.
function ff_put_vp8_epel8_h4_neon, export=1
        sub             r2,  r2,  #1            @ back up 1 column for left tap
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {d2,d3}, [r2], r3       @ 16 source bytes (8 + taps)

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
1151

    
1152
@ void ff_put_vp8_epel8_h4v4_neon(uint8_t *dst, int dststride,  @ r0, r1
@                                 uint8_t *src, int srcstride,  @ r2, r3
@                                 int h, int mx, int my)        @ stack
@ 8-wide, 4-tap horizontal + 4-tap vertical interpolation via a
@ 16-byte-aligned stack scratch buffer holding h+3 intermediate rows.
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ back up 1 row
        sub             r2,  r2,  #1            @ and 1 column
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16       @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ h+3 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte-align scratch pointer
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to scratch start
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6},     [lr,:64]      @ row 4
        sub             lr,  lr,  #16           @ net advance: 2 rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
1199

    
1200
@ void ff_put_vp8_epel8_h6v4_neon(uint8_t *dst, int dststride,  @ r0, r1
@                                 uint8_t *src, int srcstride,  @ r2, r3
@                                 int h, int mx, int my)        @ stack
@ 8-wide, 6-tap horizontal + 4-tap vertical interpolation via a
@ 16-byte-aligned stack scratch buffer holding h+3 intermediate rows.
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             r2,  r2,  r3            @ back up 1 row
        sub             r2,  r2,  #2            @ and 2 columns (6-tap h)
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16       @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ h+3 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte-align scratch pointer
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to scratch start
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6},     [lr,:64]      @ row 4
        sub             lr,  lr,  #16           @ net advance: 2 rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
1247

    
1248
@ void ff_put_vp8_epel8_h4v6_neon(uint8_t *dst, int dststride,  @ r0, r1
@                                 uint8_t *src, int srcstride,  @ r2, r3
@                                 int h, int mx, int my)        @ stack
@ 8-wide, 4-tap horizontal + 6-tap vertical interpolation via a
@ 16-byte-aligned stack scratch buffer holding h+5 intermediate rows.
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ back up 2 rows (6-tap v)
        sub             r2,  r2,  #1            @ and 1 column (4-tap h)
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #168+16       @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte-align scratch pointer
1:
        vld1.8          {d2,d3}, [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2}, [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to scratch start
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6-d7},  [lr,:128]!    @ rows 4-5
        vld1.8          {d30},    [lr,:64]      @ row 6
        sub             lr,  lr,  #32           @ net advance: 2 rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2}, [r0,:64], r1
        vst1.8          {d3}, [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
1296

    
1297
.ltorg
1298

    
1299
@ void ff_put_vp8_epel4_v6_neon(uint8_t *dst, int dststride,    @ r0, r1
@                               uint8_t *src, int srcstride,    @ r2, r3
@                               int h, int mx, int my)          @ stack
@ 4-wide, 6-tap vertical-only interpolation.  Two 4-pixel columns are
@ packed side by side into each d-register lane pair so the 8-wide
@ filter macro produces four output rows per pass.  h multiple of 4.
function ff_put_vp8_epel4_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ back up 2 rows for top taps
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.32         {d2[]},   [r2], r3      @ rows 0-6 into lane 0
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2
        vld1.32         {d2[1]},  [r2], r3      @ rows 2-8 into lane 1
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2   @ rewind for next group of 4

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc
1337

    
1338
@ void ff_put_vp8_epel4_h6_neon(uint8_t *dst, int dststride,    @ r0, r1
@                               uint8_t *src, int srcstride,    @ r2, r3
@                               int h, int mx, int my)          @ stack
@ 4-wide, 6-tap horizontal-only interpolation; only the low 4 result
@ bytes of the 8-wide filter macro are stored.
function ff_put_vp8_epel4_h6_neon, export=1
        sub             r2,  r2,  #2            @ back up 2 columns for left taps
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
1356

    
1357
@ void ff_put_vp8_epel4_h6v6_neon(uint8_t *dst, int dststride,  @ r0, r1
@                                 uint8_t *src, int srcstride,  @ r2, r3
@                                 int h, int mx, int my)        @ stack
@ 4-wide, 6-tap horizontal + 6-tap vertical interpolation.  First pass
@ stores h+5 4-byte rows to a stack scratch buffer; the second pass
@ loads two interleaved 4-row groups and transposes them so the 8-wide
@ vertical macro emits four output rows per iteration.
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ back up 2 rows
        sub             r2,  r2,  #2            @ and 2 columns
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #52+16        @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte-align scratch pointer
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to scratch start
2:
        vld1.8          {d2-d3},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6},     [lr,:64]!     @ rows 4-5
        vld1.32         {d28[]},  [lr,:32]      @ row 6
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!         @ rows 2-5 (shifted window)
        vld1.8          {d7},     [lr,:64]!     @ rows 6-7
        vld1.32         {d28[1]}, [lr,:32]      @ row 8
        sub             lr,  lr,  #16           @ net advance: 4 rows
        vtrn.32         q1,  q2                 @ interleave the two windows
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
1407

    
1408
@ void ff_put_vp8_epel4_h4v6_neon(uint8_t *dst, int dststride,  @ r0, r1
@                                 uint8_t *src, int srcstride,  @ r2, r3
@                                 int h, int mx, int my)        @ stack
@ 4-wide, 4-tap horizontal + 6-tap vertical interpolation.  Same
@ scratch-buffer + transpose scheme as the h6v6 variant, but the first
@ pass uses the 4-tap horizontal filter.
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ back up 2 rows (6-tap v)
        sub             r2,  r2,  #1            @ and 1 column (4-tap h)
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #52+16        @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ h+5 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte-align scratch pointer
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to scratch start
2:
        vld1.8          {d2-d3},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6},     [lr,:64]!     @ rows 4-5
        vld1.32         {d28[]},  [lr,:32]      @ row 6
        sub             lr,  lr,  #16
        vld1.8          {d4-d5},  [lr]!         @ rows 2-5 (shifted window)
        vld1.8          {d7},     [lr,:64]!     @ rows 6-7
        vld1.32         {d28[1]}, [lr,:32]      @ row 8
        sub             lr,  lr,  #16           @ net advance: 4 rows
        vtrn.32         q1,  q2                 @ interleave the two windows
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
1458

    
1459
@ void ff_put_vp8_epel4_h6v4_neon(uint8_t *dst, int dststride,  @ r0, r1
@                                 uint8_t *src, int srcstride,  @ r2, r3
@                                 int h, int mx, int my)        @ stack
@ 4-wide, 6-tap horizontal + 4-tap vertical interpolation.  First pass
@ stores h+3 4-byte rows to a stack scratch buffer; second pass
@ transposes two shifted 4-row windows and filters them vertically,
@ producing four output rows per iteration.
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             r2,  r2,  r3            @ back up 1 row (4-tap v)
        sub             r2,  r2,  #2            @ and 2 columns (6-tap h)
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16        @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ h+3 intermediate rows
        bic             lr,  lr,  #15           @ 16-byte-align scratch pointer
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to scratch start
2:
        vld1.8          {d2-d3},  [lr,:128]!    @ rows 0-3
        vld1.32         {d6[]},   [lr,:32]      @ row 4
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!         @ rows 2-5 (shifted window)
        vld1.32         {d6[1]},  [lr,:32]      @ row 6
        sub             lr,  lr,  #8            @ net advance: 4 rows
        vtrn.32         q1,  q2                 @ interleave the two windows
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc
1506

    
1507
@ void ff_put_vp8_epel4_h4_neon(uint8_t *dst, int dststride,    @ r0, r1
@                               uint8_t *src, int srcstride,    @ r2, r3
@                               int h, int mx, int my)          @ stack
@ 4-wide, 4-tap horizontal-only interpolation; an 8-byte load covers
@ the 4 output pixels plus the 3 extra tap columns.
function ff_put_vp8_epel4_h4_neon, export=1
        sub             r2,  r2,  #1            @ back up 1 column for left tap
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
1525

    
1526
@ void ff_put_vp8_epel4_v4_neon(uint8_t *dst, int dststride,    @ r0, r1
@                               uint8_t *src, int srcstride,    @ r2, r3
@                               int h, int mx, int my)          @ stack
@ 4-wide, 4-tap vertical-only interpolation.  Two shifted row windows
@ are packed into the two 32-bit lanes of each d-register so the 8-wide
@ filter macro emits four output rows per pass.  h multiple of 4.
function ff_put_vp8_epel4_v4_neon, export=1
        sub             r2,  r2,  r3            @ back up 1 row for top tap
        push            {r4,lr}

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        vld1.16         {q0},     [r4,:128]     @ filter coefficients
1:
        vld1.32         {d2[]},   [r2], r3      @ rows 0-4 into lane 0
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1
        vld1.32         {d2[1]},  [r2], r3      @ rows 2-6 into lane 1
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1   @ rewind for next group of 4

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc
1560

    
1561
@ void ff_put_vp8_epel4_h4v4_neon(uint8_t *dst /*r0*/, int dstStride /*r1*/,
@                                 uint8_t *src /*r2*/, int srcStride /*r3*/,
@                                 int h /*[sp]*/, int mx /*[sp+4]*/, int my /*[sp+8]*/)
@ 4-pixel-wide block, 4-tap horizontal then 4-tap vertical sub-pel filter.
@ Pass 1 filters h+3 rows horizontally into a 16-byte-aligned scratch buffer
@ on the stack; pass 2 filters that buffer vertically into dst.
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ one row above for vertical context
        sub             r2,  r2,  #1            @ one pixel left for horizontal context
        push            {r4,lr}

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4, lsl #4
        sub             sp,  sp,  #44+16        @ scratch: (8+3) rows * 4 bytes, +16 for alignment
        vld1.16         {q0},     [r4,:128]     @ q0 = horizontal filter for mx
        add             lr,  sp,  #15
        add             r12, r12, #3            @ pass 1 produces h+3 intermediate rows
        bic             lr,  lr,  #15           @ lr = 16-byte-aligned scratch pointer
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!     @ store 4 filtered pixels per row
        subs            r12, r12, #1
        bne             1b

        ldr             r4,  [sp, #44+16+16]    @ my (offsets account for push + scratch)
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4, lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]     @ q0 = vertical filter for my
        bic             lr,  lr,  #15           @ rewind to start of scratch buffer
2:
        @ Read 5 scratch rows twice with a 2-row offset and vtrn them so
        @ vp8_epel8_v4_y2 computes 4 output rows per iteration (as in the
        @ pure-vertical function above).
        vld1.8          {d2-d3},  [lr,:128]!
        vld1.32         {d6[]},   [lr,:32]
        sub             lr,  lr,  #8
        vld1.8          {d4-d5},  [lr]!
        vld1.32         {d6[1]},  [lr,:32]
        sub             lr,  lr,  #8
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16        @ release scratch buffer
        pop             {r4,pc}
endfunc
1608

    
1609
@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
@ arithmetic can be used to apply filters
@ VP8 sub-pel filter coefficient table, one 16-byte row per mx/my value 1-7
@ (value 0 is full-pel and never reaches these functions, hence the
@ "subpel_filters-16" base used at the call sites above).
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst
1620

    
1621
/* Bilinear MC */
1622

    
1623
@ 16-wide horizontal bilinear MC: dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3
@ r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp+4] = mx.
@ NOTE(review): r3 (the src-stride argument slot in the epel functions) is
@ clobbered and src rows are stepped by r1 — assumes srcStride == dstStride;
@ confirm against the VP8 MC call sites.
function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8            @ r12 = 8 - mx
        vdup.8          d0,  r3                 @ d0 = weight for src[x+1]
        vdup.8          d1,  r12                @ d1 = weight for src[x]
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2            @ two rows per iteration
        vld1.8          {d2-d4},  [r2], r1      @ load 17+ pixels of row 0
        vext.8          q2,  q1,  q2,  #1       @ q2 = row shifted left by one pixel
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r1      @ row 1 (loaded early to hide latency)
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3            @ rounding narrow: (sum + 4) >> 3
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
1653

    
1654
@ 16-wide vertical bilinear MC: dst[y] = (src[y]*(8-my) + src[y+1]*my + 4) >> 3
@ r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp+8] = my.
@ NOTE(review): steps src by r1 (clobbers r3) — assumes srcStride == dstStride.
function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8            @ r12 = 8 - my
        vdup.8          d0,  r3                 @ d0 = weight for row below
        vdup.8          d1,  r12                @ d1 = weight for current row
        ldr             r12, [sp]               @ h
        vld1.8          {q1},     [r2], r1      @ prime q1 with the first source row
1:
        subs            r12, r12, #2            @ two output rows per iteration
        vld1.8          {q2},     [r2], r1      @ next row
        vmull.u8        q3,  d2,  d1            @ out row 0 = q1*(8-my) + q2*my
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r1      @ row after next; also seeds next iteration
        vmull.u8        q9,  d4,  d1            @ out row 1 = q2*(8-my) + q1*my
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3            @ (sum + 4) >> 3
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
1683

    
1684
@ 16-wide 2-D bilinear MC: horizontal pass (mx) then vertical pass (my),
@ each with (sum + 4) >> 3 rounding, matching the scalar reference.
@ r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp+4] = mx, [sp+8] = my.
@ NOTE(review): steps src by r1 (clobbers r3) — assumes srcStride == dstStride.
function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ d0/d1 = horizontal weights mx / 8-mx
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3                 @ d2/d3 = vertical weights my / 8-my
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        @ Prologue: horizontally filter the first row into q2 so the loop
        @ always has the "previous" filtered row available.
        vld1.8          {d4-d6},  [r2], r1
        vext.8          q3,  q2,  q3,  #1       @ row shifted by one pixel
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2            @ two output rows per iteration
        vld1.8          {d18-d20},[r2], r1      @ horizontally filter next row -> q3
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r1      @ and the row after -> q2 (next "previous")
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3            @ q3 = filtered row n+1
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3            @ vertical: out0 = prev*(8-my) + q3*my
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3            @ q2 = filtered row n+2 (carried to next iter)
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3            @ vertical: out1 = q3*(8-my) + q2*my
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
1739

    
1740
@ 8-wide horizontal bilinear MC: dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3
@ r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp+4] = mx.
@ NOTE(review): steps src by r1 (clobbers r3) — assumes srcStride == dstStride.
function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8            @ r12 = 8 - mx
        vdup.8          d0,  r3                 @ weight for src[x+1]
        vdup.8          d1,  r12                @ weight for src[x]
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2            @ two rows per iteration
        vld1.8          {q1},     [r2], r1      @ 16 bytes loaded; 9 needed
        vext.8          d3,  d2,  d3,  #1       @ d3 = row shifted by one pixel
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},     [r2], r1      @ second row
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3            @ (sum + 4) >> 3
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
1764

    
1765
@ 8-wide vertical bilinear MC: dst[y] = (src[y]*(8-my) + src[y+1]*my + 4) >> 3
@ r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp+8] = my.
@ NOTE(review): steps src by r1 (clobbers r3) — assumes srcStride == dstStride.
function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8            @ r12 = 8 - my
        vdup.8          d0,  r3                 @ weight for row below
        vdup.8          d1,  r12                @ weight for current row
        ldr             r12, [sp]               @ h
        vld1.8          {d2},     [r2], r1      @ prime d2 with the first source row
1:
        subs            r12, r12, #2            @ two output rows per iteration
        vld1.8          {d3},     [r2], r1      @ next row
        vmull.u8        q2,  d2,  d1            @ out0 = d2*(8-my) + d3*my
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},     [r2], r1      @ row after next; also seeds next iteration
        vmull.u8        q3,  d3,  d1            @ out1 = d3*(8-my) + d2*my
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3            @ (sum + 4) >> 3
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
1788

    
1789
@ 8-wide 2-D bilinear MC: horizontal pass (mx) then vertical pass (my),
@ each rounded with (sum + 4) >> 3.
@ r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp+4] = mx, [sp+8] = my.
@ NOTE(review): steps src by r1 (clobbers r3) — assumes srcStride == dstStride.
function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ d0/d1 = horizontal weights mx / 8-mx
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3                 @ d2/d3 = vertical weights my / 8-my
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        @ Prologue: horizontally filter the first row into d22 ("previous" row).
        vld1.8          {q2},     [r2], r1
        vext.8          d5,  d4,  d5,  #1       @ shifted by one pixel
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2            @ two output rows per iteration
        vld1.8          {q3},     [r2], r1      @ horizontally filter row n+1 -> d16
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r1      @ and row n+2 -> d22 (carried to next iter)
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3
        vmull.u8        q10, d22, d3            @ vertical: out0 = prev*(8-my) + d16*my
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3
        vmull.u8        q12, d16, d3            @ vertical: out1 = d16*(8-my) + d22*my
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
1829

    
1830
@ 4-wide horizontal bilinear MC: two rows are packed into the two 32-bit
@ lanes of one d-register pair (via vtrn) so a single multiply pass covers both.
@ r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp+4] = mx.
@ NOTE(review): steps src by r1 (clobbers r3) — assumes srcStride == dstStride.
function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8            @ r12 = 8 - mx
        vdup.8          d0,  r3                 @ weight for src[x+1]
        vdup.8          d1,  r12                @ weight for src[x]
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2            @ two rows per iteration
        vld1.8          {d2},     [r2], r1      @ row 0
        vext.8          d3,  d2,  d3,  #1       @ row 0 shifted by one pixel
        vld1.8          {d6},     [r2], r1      @ row 1
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3                 @ pack: d2 = {row0, row1}, d3 = shifted pair
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3            @ (sum + 4) >> 3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc
1852

    
1853
@ 4-wide vertical bilinear MC: dst[y] = (src[y]*(8-my) + src[y+1]*my + 4) >> 3.
@ Consecutive row pairs share lanes: d2 = {row n, row n+1}, d3 = {row n+1, row n+2}.
@ r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp+8] = my.
@ NOTE(review): steps src by r1 (clobbers r3) — assumes srcStride == dstStride.
function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8            @ r12 = 8 - my
        vdup.8          d0,  r3                 @ weight for row below
        vdup.8          d1,  r12                @ weight for current row
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},   [r2], r1      @ prime d2[0] with the first row
1:
        vld1.32         {d3[]},   [r2]          @ row n+1 (no advance: reloaded below)
        vld1.32         {d2[1]},  [r2], r1      @ same row into d2's high lane
        vld1.32         {d3[1]},  [r2], r1      @ row n+2
        vmull.u8        q2,  d2,  d1            @ lane 0 -> out n, lane 1 -> out n+1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2                 @ move row n+2 into d2[0] for next iteration
        vrshrn.u16      d4,  q2,  #3            @ (sum + 4) >> 3
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2            @ two output rows per iteration
        bgt             1b

        bx              lr
endfunc
1875

    
1876
@ 4-wide 2-D bilinear MC: horizontal pass (mx) then vertical pass (my), with
@ two rows packed per d-register (vtrn) and the previous filtered row carried
@ across iterations in d22.
@ r0 = dst, r1 = stride, r2 = src; stack: [sp] = h, [sp+4] = mx, [sp+8] = my.
@ NOTE(review): steps src by r1 (clobbers r3) — assumes srcStride == dstStride.
function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r3,  [sp, #4]           @ mx
        rsb             r12, r3,  #8
        vdup.8          d0,  r3                 @ d0/d1 = horizontal weights mx / 8-mx
        vdup.8          d1,  r12
        ldr             r3,  [sp, #8]           @ my
        rsb             r12, r3,  #8
        vdup.8          d2,  r3                 @ d2/d3 = vertical weights my / 8-my
        vdup.8          d3,  r12
        ldr             r12, [sp]               @ h

        @ Prologue: horizontally filter the first row into d22[0].
        vld1.8          {d4},     [r2], r1
        vext.8          d5,  d4,  d4,  #1       @ shifted by one pixel
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2            @ two output rows per iteration
        vld1.8          {d6},     [r2], r1      @ row n+1
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r1      @ row n+2
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2                 @ pack both rows into one register pair
        vmull.u8        q8,  d6,  d1            @ horizontal filter on both packed rows
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3            @ d16 = {hfilt n+1, hfilt n+2}
        vmull.u8        q10, d16, d2            @ vertical: current rows * my
        vtrn.32         d22, d16                @ align previous rows alongside
        vmlal.u8        q10, d22, d3            @ + previous rows * (8-my)
        vrev64.32       d22, d16                @ carry hfilt(n+2) as next "previous" row
        vrshrn.u16      d20, q10, #3            @ (sum + 4) >> 3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc