## ffmpeg / libavcodec / arm / vp8dsp_neon.S @ ef15d71c

1 | ef15d71c | Mans Rullgard | /** |
2 | * VP8 NEON optimisations |
3 | * |
4 | * Copyright (c) 2010 Rob Clark <rob@ti.com> |
5 | * Copyright (c) 2011 Mans Rullgard <mans@mansr.com> |
6 | * |
7 | * This file is part of FFmpeg. |
8 | * |
9 | * FFmpeg is free software; you can redistribute it and/or |
10 | * modify it under the terms of the GNU Lesser General Public |
11 | * License as published by the Free Software Foundation; either |
12 | * version 2.1 of the License, or (at your option) any later version. |
13 | * |
14 | * FFmpeg is distributed in the hope that it will be useful, |
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
17 | * Lesser General Public License for more details. |
18 | * |
19 | * You should have received a copy of the GNU Lesser General Public |
20 | * License along with FFmpeg; if not, write to the Free Software |
21 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
22 | */ |
23 | |||

24 | #include "asm.S" |
@ ff_vp8_luma_dc_wht_neon
@   r0: output pointer; sixteen 16-bit results are written, one every 32 bytes
@   r1: sixteen 16-bit input values, 16-byte aligned (:128); zeroed by this routine
@ Modifies r0, r1, r3, q0-q3, q8 and q15.
function ff_vp8_luma_dc_wht_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]     @ fetch the whole 4x4 input
        vmov.i16        q15, #0

        @ First butterfly pass; clearing of the input buffer is interleaved.
        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vst1.16         {q15},    [r1,:128]!    @ zero input halfwords 0..7
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vst1.16         {q15},    [r1,:128]     @ zero input halfwords 8..15
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vmov.i16        q8,  #3                 @ rounding bias for the final >> 3

        @ Transpose the 4x4 block of halfwords.
        vtrn.32         d0,  d2
        vtrn.32         d1,  d3
        vtrn.16         d0,  d1
        vtrn.16         d2,  d3

        @ Adding the bias to d0 alone is enough: d0 contributes to every
        @ sum and difference formed in the second pass.
        vadd.i16        d0,  d0,  d16

        @ Second butterfly pass.
        vadd.i16        d4,  d0,  d3
        vadd.i16        d6,  d1,  d2
        vsub.i16        d7,  d1,  d2
        vsub.i16        d5,  d0,  d3
        vadd.i16        q0,  q2,  q3
        vsub.i16        q1,  q2,  q3

        vshr.s16        q0,  q0,  #3
        vshr.s16        q1,  q1,  #3

        @ Scatter the results: lane n of d0, d1, d2, d3 in turn, 32 bytes apart.
        mov             r3,  #32
    .irp lane, 0, 1, 2, 3
        vst1.16         {d0[\lane]},  [r0,:16], r3
        vst1.16         {d1[\lane]},  [r0,:16], r3
        vst1.16         {d2[\lane]},  [r0,:16], r3
        vst1.16         {d3[\lane]},  [r0,:16], r3
    .endr

        bx              lr
endfunc
@ ff_vp8_luma_dc_wht_dc_neon
@   r0: output pointer; the same 16-bit value is written 16 times, 32 bytes apart
@   r1: pointer to a signed 16-bit input value; it is overwritten with zero
@ The stored value is (input + 3) >> 3 (arithmetic shift).
@ Modifies r0, r2, r3.
function ff_vp8_luma_dc_wht_dc_neon, export=1
        ldrsh           r2,  [r1]               @ r2 = sign-extended input
        mov             r3,  #0
        add             r2,  r2,  #3            @ rounding bias
        strh            r3,  [r1]               @ clear the input
        asr             r2,  r2,  #3
    .rept 16
        strh            r2,  [r0], #32          @ store, then step 32 bytes
    .endr
        bx              lr
endfunc
@ ff_vp8_idct_add_neon
@   r0: pointer to four rows of 4 pixels (8-bit), each row 4-byte aligned (:32)
@   r1: sixteen 16-bit coefficients, 16-byte aligned (:128); zeroed by this routine
@   r2: byte distance between pixel rows
@ The transformed coefficients are added to the pixels with unsigned saturation.
@ Modifies r1, r3, q0-q2, q8-q13, q15; r0 ends up 4 rows past its entry value.
function ff_vp8_idct_add_neon, export=1
        vld1.16         {q0-q1},  [r1,:128]
        @ d4[0] = 20091, d4[1] = 35468/2.  The first constant is applied as
        @ x + ((x * 20091) >> 16); the second through vqdmulh, whose built-in
        @ doubling gives (x * 35468) >> 16.
        movw            r3,  #20091
        movt            r3,  #35468/2
        vdup.32         d4,  r3

        @ ---- first pass ----
        vmull.s16       q12, d1,  d4[0]
        vmull.s16       q13, d3,  d4[0]
        vqdmulh.s16     d20, d1,  d4[1]
        vqdmulh.s16     d23, d3,  d4[1]
        vshrn.s32       d21, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.s16        d21, d21, d1
        vadd.s16        d22, d22, d3

        vadd.s16        d16, d0,  d2
        vsub.s16        d17, d0,  d2
        vadd.s16        d18, d21, d23
        vsub.s16        d19, d20, d22
        vadd.s16        q0,  q8,  q9
        vsub.s16        q1,  q8,  q9

        @ transpose (note the d3/d2 ordering produced by the pass above)
        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ ---- second pass; coefficient clearing and pixel loads interleaved ----
        vmov.i16        q15, #0
        vmull.s16       q12, d1,  d4[0]
        vst1.16         {q15},    [r1,:128]!    @ zero coefficients 0..7
        vmull.s16       q13, d2,  d4[0]
        vst1.16         {q15},    [r1,:128]     @ zero coefficients 8..15
        vqdmulh.s16     d21, d1,  d4[1]
        vqdmulh.s16     d23, d2,  d4[1]
        vshrn.s32       d20, q12, #16
        vshrn.s32       d22, q13, #16
        vadd.i16        d20, d20, d1
        vadd.i16        d22, d22, d2

        vadd.i16        d16, d0,  d3
        vsub.i16        d17, d0,  d3
        vadd.i16        d18, d20, d23
        vld1.32         {d20[]},  [r0,:32], r2  @ pixel row 0
        vsub.i16        d19, d21, d22
        vld1.32         {d22[]},  [r0,:32], r2  @ pixel row 1
        vadd.s16        q0,  q8,  q9
        vld1.32         {d23[]},  [r0,:32], r2  @ pixel row 2
        vsub.s16        q1,  q8,  q9
        vld1.32         {d21[]},  [r0,:32], r2  @ pixel row 3
        vrshr.s16       q0,  q0,  #3            @ rounding shift
        vtrn.32         q10, q11                @ pack rows: d20 = 0,1  d21 = 3,2
        vrshr.s16       q1,  q1,  #3

        sub             r0,  r0,  r2,  lsl #2   @ rewind to pixel row 0

        vtrn.32         d0,  d3
        vtrn.32         d1,  d2
        vtrn.16         d0,  d1
        vtrn.16         d3,  d2

        @ add residual to pixels and narrow with unsigned saturation
        vaddw.u8        q0,  q0,  d20
        vaddw.u8        q1,  q1,  d21
        vqmovun.s16     d0,  q0
        vqmovun.s16     d1,  q1

        @ d1 holds rows 3,2 in that order, hence lane 1 is stored before lane 0
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2

        bx              lr
endfunc
@ ff_vp8_idct_dc_add_neon
@   r0: pointer to four rows of 4 pixels (8-bit), each row 4-byte aligned (:32)
@   r1: pointer to a signed 16-bit DC value; it is overwritten with zero
@   r2: byte distance between pixel rows
@ Adds (dc + 4) >> 3 to all 16 pixels with unsigned saturation.
@ Modifies r3, r12, q0-q3; r0 ends up 4 rows past its entry value.
function ff_vp8_idct_dc_add_neon, export=1
        mov             r3,  #0
        ldrsh           r12, [r1]               @ r12 = dc
        strh            r3,  [r1]               @ clear it in memory
        vdup.16         q1,  r12
        vrshr.s16       q1,  q1,  #3            @ rounded dc >> 3 in every lane
        vld1.32         {d0[]},   [r0,:32], r2  @ row 0 -> d0 (both lanes)
        vld1.32         {d1[]},   [r0,:32], r2  @ row 1 -> d1 (both lanes)
        vld1.32         {d0[1]},  [r0,:32], r2  @ row 2 -> d0 upper lane
        vld1.32         {d1[1]},  [r0,:32], r2  @ row 3 -> d1 upper lane
        vaddw.u8        q2,  q1,  d0
        vaddw.u8        q3,  q1,  d1
        sub             r0,  r0,  r2,  lsl #2   @ rewind to row 0
        vqmovun.s16     d0,  q2
        vqmovun.s16     d1,  q3
        vst1.32         {d0[0]},  [r0,:32], r2
        vst1.32         {d1[0]},  [r0,:32], r2
        vst1.32         {d0[1]},  [r0,:32], r2
        vst1.32         {d1[1]},  [r0,:32], r2
        bx              lr
endfunc
@ ff_vp8_idct_dc_add4uv_neon
@   r0: pointer to eight rows of 8 pixels (8-bit), each row 8-byte aligned (:64)
@   r1: first of four signed 16-bit DC values spaced 32 bytes apart; each is zeroed
@   r2: byte distance between pixel rows
@ DC 0/1 (d16/d17) go to the left/right halves of rows 0-3, DC 2/3 (d18/d19)
@ to the left/right halves of rows 4-7.  Each DC is rounded-shifted right by 3
@ before being added; results saturate to 0..255.
@ Modifies r0, r1, r3, q0-q3, q8-q13.
function ff_vp8_idct_dc_add4uv_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        @ Broadcast each DC across a D register, then clear it in memory.
    .irp dc, d16, d17, d18, d19
        vld1.16         {\dc[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
    .endr
        mov             r3,  r0                 @ r3 = store pointer, r0 = load pointer
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {d0},     [r0,:64], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {d1},     [r0,:64], r2
        vaddw.u8        q10, q8,  d0
        vld1.8          {d2},     [r0,:64], r2
        vaddw.u8        q0,  q8,  d1
        vld1.8          {d3},     [r0,:64], r2
        vaddw.u8        q11, q8,  d2
        vld1.8          {d4},     [r0,:64], r2
        vaddw.u8        q1,  q8,  d3
        vld1.8          {d5},     [r0,:64], r2
        vaddw.u8        q12, q9,  d4
        vld1.8          {d6},     [r0,:64], r2
        vaddw.u8        q2,  q9,  d5
        vld1.8          {d7},     [r0,:64], r2
        vaddw.u8        q13, q9,  d6
        vqmovun.s16     d20, q10
        vaddw.u8        q3,  q9,  d7
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vst1.8          {d20},    [r3,:64], r2
        vqmovun.s16     d23, q1
        vst1.8          {d21},    [r3,:64], r2
        vqmovun.s16     d24, q12
        vst1.8          {d22},    [r3,:64], r2
        vqmovun.s16     d25, q2
        vst1.8          {d23},    [r3,:64], r2
        vqmovun.s16     d26, q13
        vst1.8          {d24},    [r3,:64], r2
        vqmovun.s16     d27, q3
        vst1.8          {d25},    [r3,:64], r2
        vst1.8          {d26},    [r3,:64], r2
        vst1.8          {d27},    [r3,:64], r2

        bx              lr
endfunc
@ ff_vp8_idct_dc_add4y_neon
@   r0: pointer to four rows of 16 pixels (8-bit), each row 16-byte aligned (:128)
@   r1: first of four signed 16-bit DC values spaced 32 bytes apart; each is zeroed
@   r2: byte distance between pixel rows
@ DC n (n = 0..3) is rounded-shifted right by 3 and added to pixel columns
@ 4n..4n+3 of all four rows; results saturate to 0..255.
@ Modifies r1, r3, q0-q3, q8-q13; r0 ends up 4 rows past its entry value.
function ff_vp8_idct_dc_add4y_neon, export=1
        vmov.i16        d0,  #0
        mov             r3,  #32
        @ Broadcast each DC across a D register, then clear it in memory.
    .irp dc, d16, d17, d18, d19
        vld1.16         {\dc[]},  [r1,:16]
        vst1.16         {d0[0]},  [r1,:16], r3
    .endr
        vrshr.s16       q8,  q8,  #3            @ dc >>= 3
        vld1.8          {q0},     [r0,:128], r2
        vrshr.s16       q9,  q9,  #3
        vld1.8          {q1},     [r0,:128], r2
        vaddw.u8        q10, q8,  d0            @ q8: DC 0/1 -> columns 0..7
        vld1.8          {q2},     [r0,:128], r2
        vaddw.u8        q0,  q9,  d1            @ q9: DC 2/3 -> columns 8..15
        vld1.8          {q3},     [r0,:128], r2
        vaddw.u8        q11, q8,  d2
        vaddw.u8        q1,  q9,  d3
        vaddw.u8        q12, q8,  d4
        vaddw.u8        q2,  q9,  d5
        vaddw.u8        q13, q8,  d6
        vaddw.u8        q3,  q9,  d7
        sub             r0,  r0,  r2,  lsl #2   @ rewind to row 0
        vqmovun.s16     d20, q10
        vqmovun.s16     d21, q0
        vqmovun.s16     d22, q11
        vqmovun.s16     d23, q1
        vqmovun.s16     d24, q12
        vst1.8          {q10},    [r0,:128], r2
        vqmovun.s16     d25, q2
        vst1.8          {q11},    [r0,:128], r2
        vqmovun.s16     d26, q13
        vst1.8          {q12},    [r0,:128], r2
        vqmovun.s16     d27, q3
        vst1.8          {q13},    [r0,:128], r2

        bx              lr
endfunc
@ vp8_loop_filter: edge filter on 16 pixel positions held in registers.
@
@ Inputs:
@   q0..q7   P3, P2, P1, P0, Q0, Q1, Q2, Q3   (simple=1 reads only q2..q5)
@   q14      flim_E
@   q15      flim_I                            (not read when simple=1)
@   r12      hev_thresh                        (not read when simple=1)
@ Outputs:
@   simple=1 : q3, q4           (P0, Q0) filtered; q2, q5 returned unchanged
@   inner=1  : q2..q5           (P1..Q1) filtered
@   default  : q1..q6           (P2..Q2) filtered
@ q0 and q7 are never written.  q8..q15 are scratch.
@
.macro  vp8_loop_filter, inner=0, simple=0
    .if \simple
        vabd.u8         q9,  q3,  q4            @ |P0-Q0|
        vabd.u8         q15, q2,  q5            @ |P1-Q1|
        vqadd.u8        q9,  q9,  q9            @ |P0-Q0| * 2 (saturating)
        vshr.u8         q10, q15, #1            @ |P1-Q1| / 2
        vqadd.u8        q11, q9,  q10           @ |P0-Q0|*2 + |P1-Q1|/2
        vmov.i8         q13, #0x80
        vcle.u8         q8,  q11, q14           @ ... <= flim_E
    .else
        @ Build the filter mask (q8) and the high-edge-variance mask (q9).
        vabd.u8         q12, q2,  q3            @ |P1-P0|
        vabd.u8         q13, q5,  q4            @ |Q1-Q0|
        vabd.u8         q10, q0,  q1            @ |P3-P2|
        vabd.u8         q11, q1,  q2            @ |P2-P1|
        vcle.u8         q8,  q12, q15           @ |P1-P0| <= flim_I
        vcle.u8         q9,  q13, q15           @ |Q1-Q0| <= flim_I
        vcle.u8         q10, q10, q15           @ |P3-P2| <= flim_I
        vcle.u8         q11, q11, q15           @ |P2-P1| <= flim_I
        vand            q8,  q8,  q9
        vabd.u8         q9,  q7,  q6            @ |Q3-Q2|
        vand            q8,  q8,  q11
        vabd.u8         q11, q6,  q5            @ |Q2-Q1|
        vand            q8,  q8,  q10
        vcle.u8         q10, q9,  q15           @ |Q3-Q2| <= flim_I
        vcle.u8         q11, q11, q15           @ |Q2-Q1| <= flim_I
        vabd.u8         q9,  q3,  q4            @ |P0-Q0|
        vabd.u8         q15, q2,  q5            @ |P1-Q1| (flim_I no longer needed)
        vand            q8,  q8,  q10
        vqadd.u8        q9,  q9,  q9            @ |P0-Q0| * 2 (saturating)
        vand            q8,  q8,  q11
        vshr.u8         q10, q15, #1            @ |P1-Q1| / 2
        vdup.8          q15, r12                @ q15 = hev_thresh
        vqadd.u8        q11, q9,  q10           @ |P0-Q0|*2 + |P1-Q1|/2
        vcgt.u8         q12, q12, q15           @ |P1-P0| > hev_thresh
        vcle.u8         q11, q11, q14           @ ... <= flim_E
        vcgt.u8         q14, q13, q15           @ |Q1-Q0| > hev_thresh
        vand            q8,  q8,  q11
        vmov.i8         q13, #0x80
        vorr            q9,  q12, q14
    .endif

        @ Here: q8 = filter mask, q13 = 0x80 in every byte,
        @       q9 = hev mask (only meaningful when simple=0).

        @ Flip the top bit to reinterpret the unsigned pixels as signed values.
        veor            q3,  q3,  q13           @ PS0 = P0 ^ 0x80
        veor            q4,  q4,  q13           @ QS0 = Q0 ^ 0x80

        vmov.i16        q12, #3
        vsubl.s8        q10, d8,  d6            @ QS0 - PS0, widened to 16 bits
        vsubl.s8        q11, d9,  d7
        veor            q2,  q2,  q13           @ PS1 = P1 ^ 0x80
        veor            q5,  q5,  q13           @ QS1 = Q1 ^ 0x80
        vmul.i16        q10, q10, q12           @ w = 3 * (QS0 - PS0)
        vmul.i16        q11, q11, q12

        vqsub.s8        q12, q2,  q5            @ clamp(PS1 - QS1)
        vmov.i8         q14, #4
        vmov.i8         q15, #3
    .if \inner
        vand            q12, q12, q9            @ inner: keep that term only where hev
    .endif
        vaddw.s8        q10, q10, d24           @ w += clamp(PS1 - QS1)
        vaddw.s8        q11, q11, d25
        vqmovn.s16      d20, q10                @ saturate w back to 8 bits (q10)
        vqmovn.s16      d21, q11
    .if !\inner && !\simple
        veor            q1,  q1,  q13           @ PS2 = P2 ^ 0x80
        veor            q6,  q6,  q13           @ QS2 = Q2 ^ 0x80
    .endif
        vand            q10, q10, q8            @ w &= filter mask

        @ Register state:
        @   q0        P3  (must stay intact)
        @   q1..q6    PS2..QS2 (q1/q6 converted only in the default variant)
        @   q7        Q3  (must stay intact)
        @   q9        hev
        @   q10       w
        @   q13       0x80,  q14 = 4,  q15 = 3
        @   q8, q11, q12 are free

        @ Common step:
        @   c1 = clamp(w + 4) >> 3;   Q0 = s2u(QS0 - c1)
        @   c2 = clamp(w + 3) >> 3;   P0 = s2u(PS0 + c2)

    .if \simple
        vqadd.s8        q11, q10, q14           @ c1 = clamp(w + 4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp(w + 3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0 - c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0 + c2)
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80 (unchanged value)
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80 (unchanged value)
    .elseif \inner
        @ Inner edges additionally adjust P1/Q1 where hev is clear:
        @   c3 = ((c1 & ~hev) + 1) >> 1
        @   Q1 = s2u(QS1 - c3);  P1 = s2u(PS1 + c3)
        vqadd.s8        q11, q10, q14           @ c1 = clamp(w + 4)
        vqadd.s8        q12, q10, q15           @ c2 = clamp(w + 3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0 - c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0 + c2)
        vbic            q11, q11, q9            @ c1 & ~hev
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        vrshr.s8        q11, q11, #1            @ c3 = (c1 + 1) >> 1
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1 - c3)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1 + c3)
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
    .else
        @ Default variant: the common step runs on (w & hev), then the wide
        @ filter below runs on (w & ~hev).
        vand            q12, q10, q9            @ w & hev
        vqadd.s8        q11, q12, q14           @ c1 = clamp((w & hev) + 4)
        vqadd.s8        q12, q12, q15           @ c2 = clamp((w & hev) + 3)
        vshr.s8         q11, q11, #3            @ c1 >>= 3
        vshr.s8         q12, q12, #3            @ c2 >>= 3
        vbic            q10, q10, q9            @ w &= ~hev
        vqsub.s8        q4,  q4,  q11           @ QS0 = clamp(QS0 - c1)
        vqadd.s8        q3,  q3,  q12           @ PS0 = clamp(PS0 + c2)

        @ Wide filter:
        @   a = clamp((27*w + 63) >> 7);  Q0 -= a;  P0 += a
        @   a = clamp((18*w + 63) >> 7);  Q1 -= a;  P1 += a
        @   a = clamp(( 9*w + 63) >> 7);  Q2 -= a;  P2 += a
        vmov.i16        q9,  #63
        vshll.s8        q14, d20, #3            @ 8*w, widened
        vshll.s8        q15, d21, #3
        vaddw.s8        q14, q14, d20           @ 9*w
        vaddw.s8        q15, q15, d21
        vadd.s16        q8,  q9,  q14
        vadd.s16        q9,  q9,  q15           @  9*w + 63
        vadd.s16        q11, q8,  q14
        vadd.s16        q12, q9,  q15           @ 18*w + 63
        vadd.s16        q14, q11, q14
        vadd.s16        q15, q12, q15           @ 27*w + 63
        vqshrn.s16      d16, q8,  #7
        vqshrn.s16      d17, q9,  #7            @ clamp(( 9*w + 63) >> 7)
        vqshrn.s16      d22, q11, #7
        vqshrn.s16      d23, q12, #7            @ clamp((18*w + 63) >> 7)
        vqshrn.s16      d28, q14, #7
        vqshrn.s16      d29, q15, #7            @ clamp((27*w + 63) >> 7)
        vqadd.s8        q1,  q1,  q8            @ PS2 = clamp(PS2 + a)
        vqsub.s8        q6,  q6,  q8            @ QS2 = clamp(QS2 - a)
        vqadd.s8        q2,  q2,  q11           @ PS1 = clamp(PS1 + a)
        vqsub.s8        q5,  q5,  q11           @ QS1 = clamp(QS1 - a)
        vqadd.s8        q3,  q3,  q14           @ PS0 = clamp(PS0 + a)
        vqsub.s8        q4,  q4,  q14           @ QS0 = clamp(QS0 - a)
        veor            q3,  q3,  q13           @ P0 = PS0 ^ 0x80
        veor            q4,  q4,  q13           @ Q0 = QS0 ^ 0x80
        veor            q2,  q2,  q13           @ P1 = PS1 ^ 0x80
        veor            q5,  q5,  q13           @ Q1 = QS1 ^ 0x80
        veor            q1,  q1,  q13           @ P2 = PS2 ^ 0x80
        veor            q6,  q6,  q13           @ Q2 = QS2 ^ 0x80
    .endif
.endm
@ transpose8x16matrix: in-place byte transpose of q0..q7 using three rounds
@ of vtrn (32-bit, then 16-bit, then 8-bit element swaps).  Each D half is
@ handled independently, i.e. two 8x8 byte matrices (d0,d2,..,d14 and
@ d1,d3,..,d15) are transposed at once.  No other registers are touched.
.macro  transpose8x16matrix
        @ swap 32-bit elements between registers 4 apart
        vtrn.32         q0,  q4
        vtrn.32         q1,  q5
        vtrn.32         q2,  q6
        vtrn.32         q3,  q7

        @ swap 16-bit elements between registers 2 apart
        vtrn.16         q0,  q2
        vtrn.16         q1,  q3
        vtrn.16         q4,  q6
        vtrn.16         q5,  q7

        @ swap bytes between neighbouring registers
        vtrn.8          q0,  q1
        vtrn.8          q2,  q3
        vtrn.8          q4,  q5
        vtrn.8          q6,  q7
.endm
@ vp8_v_loop_filter16: emits ff_vp8_v_loop_filter16<name>_neon, which filters
@ a 16-pixel-wide horizontal edge.
@   r0: pointer to the first row below the edge (Q0), 16-byte aligned rows
@   r1: byte distance between rows
@   r2: flim_E
@   r3: flim_I                         (unused when simple=1)
@   [sp]: hev_thresh                   (unused when simple=1)
@ q4-q7 are saved and restored around the body.
@
@ NOTE(review): in the copy under review the lines between the initial
@ pointer adjustment and the "Store pixels" section were lost (leaving an
@ unmatched .endif and no .endm).  The load sequence, limit setup and pointer
@ back-up below were reconstructed by analogy with vp8_v_loop_filter8uv and
@ vp8_h_loop_filter16 -- confirm against the upstream file.
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        @ step back to P3 (4 rows) or, for the simple filter, to P1 (2 rows)
        sub             r0,  r0,  r1,  lsl #1+!\simple

        @ Load pixels:
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh (above the 64 bytes just pushed)
        vld1.8          {q0},     [r0,:128], r1 @ P3
        vld1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vld1.8          {q2},     [r0,:128], r1 @ P1
        vld1.8          {q3},     [r0,:128], r1 @ P0
        vld1.8          {q4},     [r0,:128], r1 @ Q0
        vld1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vld1.8          {q6},     [r0,:128], r1 @ Q2
        vld1.8          {q7},     [r0,:128]     @ Q3
        vdup.8          q15, r3                 @ flim_I
    .endif
        vdup.8          q14, r2                 @ flim_E

        vp8_loop_filter inner=\inner, simple=\simple

        @ back up to P2 (6 rows); the simple filter backs up 4 rows to P1
        sub             r0,  r0,  r1,  lsl #2
    .if !\simple
        sub             r0,  r0,  r1,  lsl #1

        @ Store pixels:
        vst1.8          {q1},     [r0,:128], r1 @ P2
    .endif
        vst1.8          {q2},     [r0,:128], r1 @ P1
        vst1.8          {q3},     [r0,:128], r1 @ P0
        vst1.8          {q4},     [r0,:128], r1 @ Q0
        vst1.8          {q5},     [r0,:128], r1 @ Q1
    .if !\simple
        vst1.8          {q6},     [r0,:128]     @ Q2
    .endif

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
@ vp8_v_loop_filter8uv: emits ff_vp8_v_loop_filter8uv<name>_neon, which
@ filters a horizontal edge in two 8-pixel-wide planes at once (u in the low
@ D halves, v in the high D halves of q0..q7).
@   r0: u pointer, first row below the edge (Q0), 8-byte aligned rows
@   r1: v pointer, same layout
@   r2: byte distance between rows
@   r3: flim_E
@   [sp]: flim_I, [sp, #4]: hev_thresh
@ q4-q7 are saved and restored around the body.
@
@ Fix: the macro definition was not terminated; the closing .endm is restored
@ so the instantiations below are no longer swallowed by the macro body.
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  r2,  lsl #2   @ step back 4 rows to P3
        sub             r1,  r1,  r2,  lsl #2
        ldr             r12, [sp, #64]          @ flim_I (above the 64 bytes just pushed)

        @ Load pixels:
        vld1.8          {d0},     [r0,:64], r2  @ P3
        vld1.8          {d1},     [r1,:64], r2  @ P3
        vld1.8          {d2},     [r0,:64], r2  @ P2
        vld1.8          {d3},     [r1,:64], r2  @ P2
        vld1.8          {d4},     [r0,:64], r2  @ P1
        vld1.8          {d5},     [r1,:64], r2  @ P1
        vld1.8          {d6},     [r0,:64], r2  @ P0
        vld1.8          {d7},     [r1,:64], r2  @ P0
        vld1.8          {d8},     [r0,:64], r2  @ Q0
        vld1.8          {d9},     [r1,:64], r2  @ Q0
        vld1.8          {d10},    [r0,:64], r2  @ Q1
        vld1.8          {d11},    [r1,:64], r2  @ Q1
        vld1.8          {d12},    [r0,:64], r2  @ Q2
        vld1.8          {d13},    [r1,:64], r2  @ Q2
        vld1.8          {d14},    [r0,:64]      @ Q3
        vld1.8          {d15},    [r1,:64]      @ Q3

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        @ back up to P2:  u,v -= stride * 6
        sub             r0,  r0,  r2,  lsl #2
        sub             r1,  r1,  r2,  lsl #2
        sub             r0,  r0,  r2,  lsl #1
        sub             r1,  r1,  r2,  lsl #1

        @ Store pixels:
        vst1.8          {d2},     [r0,:64], r2  @ P2
        vst1.8          {d3},     [r1,:64], r2  @ P2
        vst1.8          {d4},     [r0,:64], r2  @ P1
        vst1.8          {d5},     [r1,:64], r2  @ P1
        vst1.8          {d6},     [r0,:64], r2  @ P0
        vst1.8          {d7},     [r1,:64], r2  @ P0
        vst1.8          {d8},     [r0,:64], r2  @ Q0
        vst1.8          {d9},     [r1,:64], r2  @ Q0
        vst1.8          {d10},    [r0,:64], r2  @ Q1
        vst1.8          {d11},    [r1,:64], r2  @ Q1
        vst1.8          {d12},    [r0,:64]      @ Q2
        vst1.8          {d13},    [r1,:64]      @ Q2

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
@ vp8_h_loop_filter16: emits ff_vp8_h_loop_filter16<name>_neon, which filters
@ a vertical edge spanning 16 rows.  Eight bytes per row (4 on each side of
@ the edge) are loaded, transposed into the P3..Q3 layout expected by
@ vp8_loop_filter, filtered, transposed back and stored.
@   r0: pointer to the first pixel right of the edge in row 0
@   r1: byte distance between rows
@   r2: flim_E
@   r3: flim_I                         (unused when simple=1)
@   [sp]: hev_thresh                   (unused when simple=1)
@ q4-q7 are saved and restored around the body.
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ start 4 pixels left of the edge
    .if !\simple
        ldr             r12, [sp, #64]          @ hev_thresh (above the 64 bytes just pushed)
    .endif

        @ Load pixels: rows 0-7 into the low D halves, rows 8-15 into the high ones.
    .irp reg, d0, d2, d4, d6, d8, d10, d12, d14, d1, d3, d5, d7, d9, d11, d13, d15
        vld1.8          {\reg},   [r0],     r1
    .endr

        transpose8x16matrix

        vdup.8          q14, r2                 @ flim_E
    .if !\simple
        vdup.8          q15, r3                 @ flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple

        sub             r0,  r0,  r1,  lsl #4   @ rewind 16 rows

        transpose8x16matrix

        @ Store pixels, same row order as the loads; no post-increment on the last.
    .irp reg, d0, d2, d4, d6, d8, d10, d12, d14, d1, d3, d5, d7, d9, d11, d13
        vst1.8          {\reg},   [r0],     r1
    .endr
        vst1.8          {d15},    [r0]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1
||

647 | |||

@ vp8_h_loop_filter8uv: emits ff_vp8_h_loop_filter8uv<name>_neon, which
@ filters a vertical edge spanning 8 rows in two planes at once (u rows in
@ the low D halves, v rows in the high D halves of q0..q7).
@   r0: u pointer, first pixel right of the edge in row 0
@   r1: v pointer, same layout
@   r2: byte distance between rows
@   r3: flim_E
@   [sp]: flim_I, [sp, #4]: hev_thresh
@ q4-q7 are saved and restored around the body.
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        vpush           {q4-q7}
        sub             r0,  r0,  #4            @ start 4 pixels left of the edge
        sub             r1,  r1,  #4
        ldr             r12, [sp, #64]          @ flim_I (above the 64 bytes just pushed)

        @ Load pixels: one u row and one v row per Q register.
        vld1.8          {d0},     [r0],     r2  @ u
        vld1.8          {d1},     [r1],     r2  @ v
        vld1.8          {d2},     [r0],     r2
        vld1.8          {d3},     [r1],     r2
        vld1.8          {d4},     [r0],     r2
        vld1.8          {d5},     [r1],     r2
        vld1.8          {d6},     [r0],     r2
        vld1.8          {d7},     [r1],     r2
        vld1.8          {d8},     [r0],     r2
        vld1.8          {d9},     [r1],     r2
        vld1.8          {d10},    [r0],     r2
        vld1.8          {d11},    [r1],     r2
        vld1.8          {d12},    [r0],     r2
        vld1.8          {d13},    [r1],     r2
        vld1.8          {d14},    [r0],     r2
        vld1.8          {d15},    [r1],     r2

        transpose8x16matrix

        vdup.8          q14, r3                 @ flim_E
        vdup.8          q15, r12                @ flim_I
        ldr             r12, [sp, #68]          @ hev_thresh

        vp8_loop_filter inner=\inner

        sub             r0,  r0,  r2,  lsl #3   @ rewind u by 8 rows
        sub             r1,  r1,  r2,  lsl #3   @ rewind v by 8 rows

        transpose8x16matrix

        @ Store pixels, mirroring the load order; last row without post-increment.
        vst1.8          {d0},     [r0],     r2
        vst1.8          {d1},     [r1],     r2
        vst1.8          {d2},     [r0],     r2
        vst1.8          {d3},     [r1],     r2
        vst1.8          {d4},     [r0],     r2
        vst1.8          {d5},     [r1],     r2
        vst1.8          {d6},     [r0],     r2
        vst1.8          {d7},     [r1],     r2
        vst1.8          {d8},     [r0],     r2
        vst1.8          {d9},     [r1],     r2
        vst1.8          {d10},    [r0],     r2
        vst1.8          {d11},    [r1],     r2
        vst1.8          {d12},    [r0],     r2
        vst1.8          {d13},    [r1],     r2
        vst1.8          {d14},    [r0]
        vst1.8          {d15},    [r1]

        vpop            {q4-q7}
        bx              lr
endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
||

711 | |||

@ ff_put_vp8_pixels16_neon: copy a 16-byte-wide block, four rows per pass.
@   r0: destination, rows 16-byte aligned (:128)
@   r1: destination row distance in bytes
@   r2: source (no alignment assumed)
@   r3: source row distance in bytes
@   [sp]: h, the row count; the loop runs until the count, reduced by 4 per
@         pass, is no longer positive
@ Modifies r0, r2, r12, q0-q3.
function ff_put_vp8_pixels16_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
    .irp row, q0, q1, q2, q3
        vld1.8          {\row},   [r2], r3
    .endr
    .irp row, q0, q1, q2, q3
        vst1.8          {\row},   [r0,:128], r1
    .endr
        bgt             1b
        bx              lr
endfunc
||

727 | |||

@ ff_put_vp8_pixels8_neon: copy an 8-byte-wide block, four rows per pass.
@   r0: destination, rows 8-byte aligned (:64)
@   r1: destination row distance in bytes
@   r2: source (no alignment assumed)
@   r3: source row distance in bytes
@   [sp]: h, the row count; the loop runs until the count, reduced by 4 per
@         pass, is no longer positive
@ Modifies r0, r2, r12, d0-d3.
function ff_put_vp8_pixels8_neon, export=1
        ldr             r12, [sp, #0]           @ h
1:
        subs            r12, r12, #4
    .irp row, d0, d1, d2, d3
        vld1.8          {\row},   [r2], r3
    .endr
    .irp row, d0, d1, d2, d3
        vst1.8          {\row},   [r0,:64], r1
    .endr
        bgt             1b
        bx              lr
endfunc
||

743 | |||

@ ff_put_vp8_pixels4_neon: copy a 4-byte-wide block, four rows per pass,
@ using plain 32-bit integer loads and stores.
@   r0: destination
@   r1: destination row distance in bytes
@   r2: source
@   r3: source row distance in bytes
@   [sp]: h, the row count; the loop runs until the count, reduced by 4 per
@         pass, is no longer positive
@ r4-r6 and lr are saved on entry and restored on return.
function ff_put_vp8_pixels4_neon, export=1
        ldr             r12, [sp, #0]           @ h (read before the push moves sp)
        push            {r4-r6,lr}
1:
        subs            r12, r12, #4
    .irp row, r4, r5, r6, lr
        ldr             \row, [r2], r3
    .endr
    .irp row, r4, r5, r6, lr
        str             \row, [r0], r1
    .endr
        bgt             1b
        pop             {r4-r6,pc}
endfunc
||

760 | |||

761 | /* 4/6-tap 8th-pel MC */ |
||

762 | |||

@ vp8_epel8_h6: 6-tap horizontal filter producing 8 output bytes.
@   \d      destination D register
@   \a, \b  two consecutive D registers holding 16 source bytes
@   q0      six 16-bit coefficient magnitudes c0..c5 in d0[0]..d1[1];
@           c1 and c4 are subtracted, the others added
@ out[i] = sat_u8((c0*s[i] - c1*s[i+1] + c2*s[i+2]
@                  + c3*s[i+3] - c4*s[i+4] + c5*s[i+5] + 64) >> 7)
@ The two 3-tap halves are accumulated separately and joined with a signed
@ saturating add.  Scratch: q8-q15.
.macro  vp8_epel8_h6    d,   a,   b
        vext.8          d27, \a,  \b,  #1       @ s[i+1]
        vmovl.u8        q8,  \a                 @ s[i]
        vext.8          d28, \a,  \b,  #2       @ s[i+2]
        vmovl.u8        q9,  d27
        vext.8          d29, \a,  \b,  #3       @ s[i+3]
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #4       @ s[i+4]
        vmovl.u8        q11, d29
        vext.8          d31, \a,  \b,  #5       @ s[i+5]
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmovl.u8        q13, d31
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm
||

784 | |||

@ vp8_epel16_h6: 6-tap horizontal filter producing 16 output bytes.
@   \d0, \d1   destination D registers (outputs 0-7 and 8-15)
@   \s0, \s1   D registers holding source bytes 0-7 and 8-15
@   \s2        accepted but not referenced in the body
@   \q0, \q1   Q registers holding source bytes 0-15 and 16-31; vext reads
@              shifted windows across the pair
@   q0 (the register)  coefficients c0..c5 as in vp8_epel8_h6
@ Note that the parameters named d0/d1/q0/q1 are only used as \d0, \d1, \q0,
@ \q1; bare d0[..]/d1[..] below refer to the coefficient register.
@ Scratch: q1-q3, q8-q15.
.macro  vp8_epel16_h6   d0,  d1,  s0,  s1,  s2,  q0,  q1
        vext.8          q14, \q0, \q1, #3       @ s[i+3]
        vext.8          q15, \q0, \q1, #4       @ s[i+4]
        vmovl.u8        q11, d28
        vmovl.u8        q14, d29
        vext.8          q3,  \q0, \q1, #2       @ s[i+2]
        vmovl.u8        q12, d30
        vmovl.u8        q15, d31
        vext.8          q8,  \q0, \q1, #1       @ s[i+1]
        vmovl.u8        q10, d6
        vmovl.u8        q3,  d7
        vext.8          q2,  \q0, \q1, #5       @ s[i+5]
        vmovl.u8        q13, d4
        vmovl.u8        q2,  d5
        vmovl.u8        q9,  d16
        vmovl.u8        q8,  d17
        vmul.u16        q11, q11, d0[3]
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q3,  q3,  d0[2]
        vmul.u16        q14, q14, d0[3]
        vmls.u16        q11, q12, d1[0]
        vmovl.u8        q12, \s0                @ s[i], low half
        vmovl.u8        q1,  \s1                @ s[i], high half
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q3,  q8,  d0[1]
        vmls.u16        q14, q15, d1[0]
        vmla.u16        q10, q12, d0[0]
        vmla.u16        q11, q13, d1[1]
        vmla.u16        q3,  q1,  d0[0]
        vmla.u16        q14, q2,  d1[1]
        vqadd.s16       q11, q10, q11
        vqadd.s16       q14, q3,  q14
        vqrshrun.s16    \d0, q11, #7
        vqrshrun.s16    \d1, q14, #7
.endm
||

820 | |||

@ vp8_epel8_v6: 6-tap vertical filter producing one row of 8 output bytes.
@   \d0        destination D register
@   \s0..\s5   D registers holding six consecutive source rows
@   q0 (the register)  coefficients c0..c5 in d0[0]..d1[1]; c1 and c4 are
@                      subtracted, the others added
@ out = sat_u8((c0*s0 - c1*s1 + c2*s2 + c3*s3 - c4*s4 + c5*s5 + 64) >> 7)
@ Scratch: q8-q13.
.macro  vp8_epel8_v6    d0,  s0,  s1,  s2,  s3,  s4,  s5
        vmovl.u8        q10, \s2
        vmovl.u8        q11, \s3
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s0
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vmla.u16        q10, q8,  d0[0]
        vmla.u16        q11, q13, d1[1]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d0, q11, #7
.endm
||

837 | |||

@ vp8_epel8_v6_y2: 6-tap vertical filter producing two rows of 8 bytes from
@ seven consecutive source rows.
@   \d0, \d1   destination D registers (first and second output row)
@   \s0..\s6   D registers holding the source rows
@   q0 (the register)  coefficients c0..c5 in d0[0]..d1[1]
@ row 0 = sat_u8((c0*s0 - c1*s1 + c2*s2 + c3*s3 - c4*s4 + c5*s5 + 64) >> 7)
@ row 1 = sat_u8((c0*s1 - c1*s2 + c2*s3 + c3*s4 - c4*s5 + c5*s6 + 64) >> 7)
@ Scratch: q8-q15.
.macro  vp8_epel8_v6_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4,  s5,  s6
        vmovl.u8        q10, \s0
        vmovl.u8        q11, \s3
        vmovl.u8        q14, \s6
        vmovl.u8        q9,  \s1
        vmovl.u8        q12, \s4
        vmovl.u8        q8,  \s2
        vmovl.u8        q13, \s5
        vmul.u16        q10, q10, d0[0]         @ row 0: c0*s0
        vmul.u16        q15, q11, d0[3]         @ row 0: c3*s3
        vmul.u16        q11, q11, d0[2]         @ row 1: c2*s3
        vmul.u16        q14, q14, d1[1]         @ row 1: c5*s6
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q15, q12, d1[0]
        vmls.u16        q11, q8,  d0[1]
        vmls.u16        q14, q13, d1[0]
        vmla.u16        q10, q8,  d0[2]
        vmla.u16        q15, q13, d1[1]
        vmla.u16        q11, q9,  d0[0]
        vmla.u16        q14, q12, d0[3]
        vqadd.s16       q15, q10, q15
        vqadd.s16       q14, q11, q14
        vqrshrun.s16    \d0, q15, #7
        vqrshrun.s16    \d1, q14, #7
.endm
||

863 | |||

@ vp8_epel8_h4: 4-tap horizontal filter producing 8 output bytes.
@   \d      destination D register
@   \a, \b  two consecutive D registers holding 16 source bytes
@   q0      coefficients; only c1..c4 (d0[1]..d1[0]) are used, with c1 and
@           c4 subtracted
@ out[i] = sat_u8((-c1*s[i] + c2*s[i+1] + c3*s[i+2] - c4*s[i+3] + 64) >> 7)
@ Scratch: q9-q12, q14, d30.
.macro  vp8_epel8_h4    d,   a,   b
        vext.8          d28, \a,  \b,  #1       @ s[i+1]
        vmovl.u8        q9,  \a                 @ s[i]
        vext.8          d29, \a,  \b,  #2       @ s[i+2]
        vmovl.u8        q10, d28
        vext.8          d30, \a,  \b,  #3       @ s[i+3]
        vmovl.u8        q11, d29
        vmovl.u8        q12, d30
        vmul.u16        q10, q10, d0[2]
        vmul.u16        q11, q11, d0[3]
        vmls.u16        q10, q9,  d0[1]
        vmls.u16        q11, q12, d1[0]
        vqadd.s16       q11, q10, q11
        vqrshrun.s16    \d,  q11, #7
.endm
||

879 | |||

@ vp8_epel8_v4_y2: 4-tap vertical filter producing two rows of 8 bytes from
@ five consecutive source rows.
@   \d0, \d1   destination D registers (first and second output row)
@   \s0..\s4   D registers holding the source rows
@   q0 (the register)  coefficients; only c1..c4 (d0[1]..d1[0]) are used
@ row 0 = sat_u8((-c1*s0 + c2*s1 + c3*s2 - c4*s3 + 64) >> 7)
@ row 1 = sat_u8((-c1*s1 + c2*s2 + c3*s3 - c4*s4 + 64) >> 7)
@ Scratch: q8-q15.
.macro  vp8_epel8_v4_y2 d0,  d1,  s0,  s1,  s2,  s3,  s4
        vmovl.u8        q9,  \s0
        vmovl.u8        q10, \s1
        vmovl.u8        q11, \s2
        vmovl.u8        q12, \s3
        vmovl.u8        q13, \s4
        vmul.u16        q8,  q10, d0[2]         @ row 0: c2*s1
        vmul.u16        q14, q11, d0[3]         @ row 0: c3*s2
        vmul.u16        q11, q11, d0[2]         @ row 1: c2*s2
        vmul.u16        q15, q12, d0[3]         @ row 1: c3*s3
        vmls.u16        q8,  q9,  d0[1]
        vmls.u16        q14, q12, d1[0]
        vmls.u16        q11, q10, d0[1]
        vmls.u16        q15, q13, d1[0]
        vqadd.s16       q8,  q8,  q14
        vqadd.s16       q11, q11, q15
        vqrshrun.s16    \d0, q8,  #7
        vqrshrun.s16    \d1, q11, #7
.endm
||

899 | |||

@ ff_put_vp8_epel16_v6_neon: 16-wide, 6-tap vertical filter; two output rows
@ per loop pass.
@   r0: destination, rows 16-byte aligned (:128)
@   r1: destination row distance in bytes
@   r2: source
@   r3: source row distance in bytes
@   stack on entry: [sp] = h, [sp, #8] = my (slot [sp, #4] is not read here)
@ h must be a non-zero multiple of 2: the loop exits only on count == 0.
@ r4, lr and d8-d15 are saved and restored.
function ff_put_vp8_epel16_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start two rows above
        push            {r4,lr}
        vpush           {d8-d15}

        @ 8 bytes (push) + 64 bytes (vpush) now sit below the stack arguments
        ldr             r4,  [sp, #80]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #72]          @ h
        add             r4,  lr,  r4,  lsl #4   @ 16-byte entries; my == 1 selects the first
        vld1.16         {q0},     [r4,:128]     @ q0 = filter coefficients
1:
        vld1.8          {d2-d3},  [r2], r3
        vld1.8          {d4-d5},  [r2], r3
        vld1.8          {d6-d7},  [r2], r3
        vld1.8          {d8-d9},  [r2], r3
        vld1.8          {d10-d11},[r2], r3
        vld1.8          {d12-d13},[r2], r3
        vld1.8          {d14-d15},[r2]
        sub             r2,  r2,  r3,  lsl #2   @ net advance: two rows per pass

        @ left and right 8-pixel halves are filtered separately
        vp8_epel8_v6_y2 d2,  d4,  d2,  d4,  d6,  d8,  d10, d12, d14
        vp8_epel8_v6_y2 d3,  d5,  d3,  d5,  d7,  d9,  d11, d13, d15

        vst1.8          {d2-d3},  [r0,:128], r1
        vst1.8          {d4-d5},  [r0,:128], r1
        subs            r12, r12, #2
        bne             1b

        vpop            {d8-d15}
        pop             {r4,pc}
endfunc
||

931 | |||

@ ff_put_vp8_epel16_h6_neon: 16-wide, 6-tap horizontal filter; one output row
@ per loop pass.
@   r0: destination, rows 16-byte aligned (:128)
@   r1: destination row distance in bytes
@   r2: source
@   r3: source row distance in bytes
@   stack on entry: [sp] = h, [sp, #4] = mx
@ h must be non-zero: the loop exits only on count == 0.
@ Each pass reads 24 source bytes starting 2 bytes left of the output column.
@ r4 and lr are saved and restored.
function ff_put_vp8_epel16_h6_neon, export=1
        sub             r2,  r2,  #2            @ start two pixels to the left
        push            {r4,lr}

        @ 8 bytes (push) now sit below the stack arguments
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4   @ 16-byte entries; mx == 1 selects the first
        vld1.16         {q0},     [r4,:128]     @ q0 = filter coefficients
1:
        vld1.8          {d2-d4},  [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||

952 | |||

@ ff_put_vp8_epel16_h6v6_neon: 16-wide, 6-tap horizontal then 6-tap vertical
@ filter, using a 16-byte-aligned scratch buffer on the stack.
@   r0: destination, rows 16-byte aligned (:128)
@   r1: destination row distance in bytes
@   r2: source
@   r3: source row distance in bytes
@   stack on entry: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ h must be non-zero: both loops exit only on count == 0.
@ The scratch area is 336+16 bytes: (16+5) rows of 16 bytes plus alignment
@ slack, so h is presumably at most 16 -- confirm with callers.
@ r4, lr and d8-d9 are saved and restored.
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ two rows up ...
        sub             r2,  r2,  #2            @ ... and two pixels left
        push            {r4,lr}
        vpush           {d8-d9}

        @ first pass (horizontal): h+5 rows into the scratch buffer
        @ 8 bytes (push) + 16 bytes (vpush) now sit below the stack arguments
        ldr             r4,  [sp, #28]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #24]          @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #336+16       @ reserve scratch
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ 5 extra rows feed the vertical taps
        bic             lr,  lr,  #15           @ lr = scratch, rounded up to 16
1:
        vld1.8          {d2,d3,d4}, [r2], r3

        vp8_epel16_h6   d2,  d3,  d2,  d3,  d4,  q1,  q2

        vst1.8          {d2-d3},  [lr,:128]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical): one output row per pass
        ldr             r4,  [sp, #336+16+32]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #336+16+24]   @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ back to the start of scratch
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ rows 0,1
        vld1.8          {d6-d9},  [lr,:128]!    @ rows 2,3
        vld1.8          {d28-d31},[lr,:128]     @ rows 4,5
        sub             lr,  lr,  #48           @ net advance: one 16-byte row

        vp8_epel8_v6    d2,  d2,  d4,  d6,  d8,  d28, d30
        vp8_epel8_v6    d3,  d3,  d5,  d7,  d9,  d29, d31

        vst1.8          {d2-d3},  [r0,:128], r1
        subs            r12, r12, #1
        bne             2b

        add             sp,  sp,  #336+16       @ release scratch
        vpop            {d8-d9}
        pop             {r4,pc}
endfunc
||

1003 | |||

@ ff_put_vp8_epel8_v6_neon: 8-wide, 6-tap vertical filter; two output rows
@ per loop pass.
@   r0: destination, rows 8-byte aligned (:64)
@   r1: destination row distance in bytes
@   r2: source
@   r3: source row distance in bytes
@   stack on entry: [sp] = h, [sp, #8] = my (slot [sp, #4] is not read here)
@ h must be a non-zero multiple of 2: the loop exits only on count == 0.
@ r4 and lr are saved and restored.
function ff_put_vp8_epel8_v6_neon, export=1
        sub             r2,  r2,  r3,  lsl #1   @ start two rows above
        push            {r4,lr}

        @ 8 bytes (push) now sit below the stack arguments
        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4   @ 16-byte entries; my == 1 selects the first
        vld1.16         {q0},     [r4,:128]     @ q0 = filter coefficients
1:
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2], r3
        vld1.8          {d7},     [r2], r3
        vld1.8          {d28},    [r2]

        sub             r2,  r2,  r3,  lsl #2   @ net advance: two rows per pass

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
||

1033 | |||

@ ff_put_vp8_epel8_h6_neon: 8-wide, 6-tap horizontal filter; one output row
@ per loop pass.
@   r0: destination, rows 8-byte aligned (:64)
@   r1: destination row distance in bytes
@   r2: source
@   r3: source row distance in bytes
@   stack on entry: [sp] = h, [sp, #4] = mx
@ h must be non-zero: the loop exits only on count == 0.
@ Each pass reads 16 source bytes starting 2 bytes left of the output column.
@ r4 and lr are saved and restored.
function ff_put_vp8_epel8_h6_neon, export=1
        sub             r2,  r2,  #2            @ start two pixels to the left
        push            {r4,lr}

        @ 8 bytes (push) now sit below the stack arguments
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4   @ 16-byte entries; mx == 1 selects the first
        vld1.16         {q0},     [r4,:128]     @ q0 = filter coefficients
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2},     [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||

1054 | |||

@ ff_put_vp8_epel8_h6v6_neon: 8-pixel-wide 2-D sub-pixel filter,
@ 6-tap horizontal pass followed by a 6-tap vertical pass.
@   r0 = dst (64-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ The horizontal pass writes h + 5 rows of 8 bytes into a 16-byte-aligned
@ scratch area of 168 bytes on the stack, so h must not exceed 16.
@ The vertical pass emits two rows per iteration (h must be a non-zero
@ multiple of 2).
function ff_put_vp8_epel8_h6v6_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  r3,  lsl #1   @ two rows up
        sub             r2,  r2,  #2            @ two pixels left

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16       @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ rows needed by the 6-tap vertical pass
        bic             lr,  lr,  #15           @ lr = scratch rounded up to 16 bytes
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to start of scratch
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6-d7},  [lr,:128]!    @ rows 4-5
        vld1.8          {d30},    [lr,:64]      @ row 6
        sub             lr,  lr,  #32           @ net advance: 16 bytes = two rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||

1103 | |||

@ ff_put_vp8_epel8_v4_neon: 8-pixel-wide, vertical 4-tap sub-pixel filter.
@   r0 = dst (64-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #8] = my
@ Two rows per iteration; h must be a non-zero multiple of 2.
function ff_put_vp8_epel8_v4_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  r3            @ start one row above the block

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16  @ table is indexed from 1
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4   @ 16 bytes per filter row
        vld1.16         {q0},     [r4,:128]     @ q0 = selected filter taps
1:
        @ fetch five consecutive source rows
        vld1.8          {d2},     [r2], r3
        vld1.8          {d3},     [r2], r3
        vld1.8          {d4},     [r2], r3
        vld1.8          {d5},     [r2], r3
        vld1.8          {d6},     [r2]
        sub             r2,  r2,  r3,  lsl #1   @ net advance: two rows

        @ macro defined outside this excerpt; results land in d2 and d3
        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             1b

        pop             {r4,pc}
endfunc
||

1130 | |||

@ ff_put_vp8_epel8_h4_neon: 8-pixel-wide, horizontal 4-tap sub-pixel filter.
@   r0 = dst (64-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #4] = mx
@ One row per iteration; h must be non-zero. 16 source bytes are read per row.
function ff_put_vp8_epel8_h4_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  #1            @ start one pixel to the left

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16  @ table is indexed from 1
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4   @ 16 bytes per filter row
        vld1.16         {q0},     [r4,:128]     @ q0 = selected filter taps
1:
        vld1.8          {d2,d3},  [r2], r3

        @ macro defined outside this excerpt; result lands in d2
        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2},     [r0,:64], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||

1151 | |||

@ ff_put_vp8_epel8_h4v4_neon: 8-pixel-wide 2-D sub-pixel filter,
@ 4-tap horizontal pass followed by a 4-tap vertical pass.
@   r0 = dst (64-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ The horizontal pass writes h + 3 rows of 8 bytes into a 16-byte-aligned
@ stack scratch area of 168 bytes. The vertical pass emits two rows per
@ iteration (h must be a non-zero multiple of 2).
function ff_put_vp8_epel8_h4v4_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  r3            @ one row up
        sub             r2,  r2,  #1            @ one pixel left

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16       @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ rows needed by the 4-tap vertical pass
        bic             lr,  lr,  #15           @ lr = scratch rounded up to 16 bytes
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to start of scratch
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6},     [lr,:64]      @ row 4
        sub             lr,  lr,  #16           @ net advance: 16 bytes = two rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||

1199 | |||

@ ff_put_vp8_epel8_h6v4_neon: 8-pixel-wide 2-D sub-pixel filter,
@ 6-tap horizontal pass followed by a 4-tap vertical pass.
@   r0 = dst (64-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ The horizontal pass writes h + 3 rows of 8 bytes into a 16-byte-aligned
@ stack scratch area of 168 bytes. The vertical pass emits two rows per
@ iteration (h must be a non-zero multiple of 2).
function ff_put_vp8_epel8_h6v4_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  r3            @ one row up
        sub             r2,  r2,  #2            @ two pixels left

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16       @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ rows needed by the 4-tap vertical pass
        bic             lr,  lr,  #15           @ lr = scratch rounded up to 16 bytes
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h6    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to start of scratch
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6},     [lr,:64]      @ row 4
        sub             lr,  lr,  #16           @ net advance: 16 bytes = two rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||

1247 | |||

@ ff_put_vp8_epel8_h4v6_neon: 8-pixel-wide 2-D sub-pixel filter,
@ 4-tap horizontal pass followed by a 6-tap vertical pass.
@   r0 = dst (64-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ The horizontal pass writes h + 5 rows of 8 bytes into a 16-byte-aligned
@ scratch area of 168 bytes on the stack, so h must not exceed 16.
@ The vertical pass emits two rows per iteration (h must be a non-zero
@ multiple of 2).
function ff_put_vp8_epel8_h4v6_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  r3,  lsl #1   @ two rows up
        sub             r2,  r2,  #1            @ one pixel left

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #168+16       @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ rows needed by the 6-tap vertical pass
        bic             lr,  lr,  #15           @ lr = scratch rounded up to 16 bytes
1:
        vld1.8          {d2,d3},  [r2], r3

        vp8_epel8_h4    d2,  d2,  d3

        vst1.8          {d2},     [lr,:64]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #168+16+16]   @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #168+16+8]    @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to start of scratch
2:
        vld1.8          {d2-d5},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6-d7},  [lr,:128]!    @ rows 4-5
        vld1.8          {d30},    [lr,:64]      @ row 6
        sub             lr,  lr,  #32           @ net advance: 16 bytes = two rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d30

        vst1.8          {d2},     [r0,:64], r1
        vst1.8          {d3},     [r0,:64], r1
        subs            r12, r12, #2
        bne             2b

        add             sp,  sp,  #168+16
        pop             {r4,pc}
endfunc
||

1296 | |||

@ ff_put_vp8_epel4_v6_neon: 4-pixel-wide, vertical 6-tap sub-pixel filter.
@   r0 = dst (32-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #8] = my
@ Four rows per iteration (h must be a non-zero multiple of 4). Two row
@ windows, two rows apart, are packed into the low and high 32-bit lanes of
@ each d register so that the 8-wide macro handles both at once.
function ff_put_vp8_epel4_v6_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  r3,  lsl #1   @ start two rows above the block

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16  @ table is indexed from 1
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4   @ 16 bytes per filter row
        vld1.16         {q0},     [r4,:128]     @ q0 = selected filter taps
1:
        @ rows 0..6 -> all lanes (lane 1 is overwritten below)
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2], r3
        vld1.32         {d7[]},   [r2], r3
        vld1.32         {d28[]},  [r2]
        sub             r2,  r2,  r3,  lsl #2   @ back to row 2
        @ rows 2..8 -> lane 1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2], r3
        vld1.32         {d7[1]},  [r2], r3
        vld1.32         {d28[1]}, [r2]
        sub             r2,  r2,  r3,  lsl #2   @ net advance: four rows

        vp8_epel8_v6_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6,  d7,  d28

        @ lane 0 holds output rows 0/1, lane 1 holds rows 2/3
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc
||

1335 | |||

@ ff_put_vp8_epel4_h6_neon: 4-pixel-wide, horizontal 6-tap sub-pixel filter.
@   r0 = dst (32-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #4] = mx
@ One row per iteration; h must be non-zero. The 8-wide macro is reused and
@ only the low four result bytes are stored. 16 source bytes are read per row.
function ff_put_vp8_epel4_h6_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  #2            @ start two pixels to the left

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16  @ table is indexed from 1
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4   @ 16 bytes per filter row
        vld1.16         {q0},     [r4,:128]     @ q0 = selected filter taps
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||

1354 | |||

@ ff_put_vp8_epel4_h6v6_neon: 4-pixel-wide 2-D sub-pixel filter,
@ 6-tap horizontal pass followed by a 6-tap vertical pass.
@   r0 = dst (32-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ The horizontal pass writes h + 5 rows of 4 bytes into a 16-byte-aligned
@ scratch area of 52 bytes on the stack, so h must not exceed 8.
@ The vertical pass emits four rows per iteration (h must be a non-zero
@ multiple of 4).
function ff_put_vp8_epel4_h6v6_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  r3,  lsl #1   @ two rows up
        sub             r2,  r2,  #2            @ two pixels left

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16        @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ rows needed by the 6-tap vertical pass
        bic             lr,  lr,  #15           @ lr = scratch rounded up to 16 bytes
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!     @ keep only the low four pixels
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to start of scratch
2:
        @ window A: rows 0..6
        vld1.8          {d2-d3},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6},     [lr,:64]!     @ rows 4-5
        vld1.32         {d28[]},  [lr,:32]      @ row 6
        sub             lr,  lr,  #16           @ back to row 2
        @ window B: rows 2..8
        vld1.8          {d4-d5},  [lr]!         @ rows 2-5 (only 8-byte aligned)
        vld1.8          {d7},     [lr,:64]!     @ rows 6-7
        vld1.32         {d28[1]}, [lr,:32]      @ row 8
        sub             lr,  lr,  #16           @ net advance: 16 bytes = four rows
        @ interleave the two windows lane-wise for the 8-wide macro
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
||

1405 | |||

@ ff_put_vp8_epel4_h4v6_neon: 4-pixel-wide 2-D sub-pixel filter,
@ 4-tap horizontal pass followed by a 6-tap vertical pass.
@   r0 = dst (32-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ The horizontal pass writes h + 5 rows of 4 bytes into a 16-byte-aligned
@ scratch area of 52 bytes on the stack, so h must not exceed 8.
@ The vertical pass emits four rows per iteration (h must be a non-zero
@ multiple of 4).
function ff_put_vp8_epel4_h4v6_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  r3,  lsl #1   @ two rows up
        sub             r2,  r2,  #1            @ one pixel left

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #52+16        @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #5            @ rows needed by the 6-tap vertical pass
        bic             lr,  lr,  #15           @ lr = scratch rounded up to 16 bytes
1:
        vld1.8          {d2},     [r2], r3      @ only 8 source bytes are read
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!     @ keep only the low four pixels
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #52+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #52+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to start of scratch
2:
        @ window A: rows 0..6
        vld1.8          {d2-d3},  [lr,:128]!    @ rows 0-3
        vld1.8          {d6},     [lr,:64]!     @ rows 4-5
        vld1.32         {d28[]},  [lr,:32]      @ row 6
        sub             lr,  lr,  #16           @ back to row 2
        @ window B: rows 2..8
        vld1.8          {d4-d5},  [lr]!         @ rows 2-5 (only 8-byte aligned)
        vld1.8          {d7},     [lr,:64]!     @ rows 6-7
        vld1.32         {d28[1]}, [lr,:32]      @ row 8
        sub             lr,  lr,  #16           @ net advance: 16 bytes = four rows
        @ interleave the two windows lane-wise for the 8-wide macro
        vtrn.32         q1,  q2
        vtrn.32         d6,  d7
        vp8_epel8_v6_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6,  d7,  d28
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #52+16
        pop             {r4,pc}
endfunc
||

1456 | |||

@ ff_put_vp8_epel4_h6v4_neon: 4-pixel-wide 2-D sub-pixel filter,
@ 6-tap horizontal pass followed by a 4-tap vertical pass.
@   r0 = dst (32-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ The horizontal pass writes h + 3 rows of 4 bytes into a 16-byte-aligned
@ scratch area of 44 bytes on the stack, so h must not exceed 8.
@ The vertical pass emits four rows per iteration (h must be a non-zero
@ multiple of 4).
function ff_put_vp8_epel4_h6v4_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  r3            @ one row up
        sub             r2,  r2,  #2            @ two pixels left

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16        @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ rows needed by the 4-tap vertical pass
        bic             lr,  lr,  #15           @ lr = scratch rounded up to 16 bytes
1:
        vld1.8          {q1},     [r2], r3
        vp8_epel8_h6    d2,  d2,  d3
        vst1.32         {d2[0]},  [lr,:32]!     @ keep only the low four pixels
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to start of scratch
2:
        @ window A: rows 0..4
        vld1.8          {d2-d3},  [lr,:128]!    @ rows 0-3
        vld1.32         {d6[]},   [lr,:32]      @ row 4
        sub             lr,  lr,  #8            @ back to row 2
        @ window B: rows 2..6
        vld1.8          {d4-d5},  [lr]!         @ rows 2-5 (only 8-byte aligned)
        vld1.32         {d6[1]},  [lr,:32]      @ row 6
        sub             lr,  lr,  #8            @ net advance: 16 bytes = four rows
        @ interleave the two windows lane-wise for the 8-wide macro
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc
||

1504 | |||

@ ff_put_vp8_epel4_h4_neon: 4-pixel-wide, horizontal 4-tap sub-pixel filter.
@   r0 = dst (32-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #4] = mx
@ One row per iteration; h must be non-zero. Only 8 source bytes are read per
@ row, and d2 is passed for both source operands of the 8-wide macro; only the
@ low four result bytes are stored.
function ff_put_vp8_epel4_h4_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  #1            @ start one pixel to the left

        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16  @ table is indexed from 1
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4   @ 16 bytes per filter row
        vld1.16         {q0},     [r4,:128]     @ q0 = selected filter taps
1:
        vld1.8          {d2},     [r2], r3
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [r0,:32], r1
        subs            r12, r12, #1
        bne             1b

        pop             {r4,pc}
endfunc
||

1523 | |||

@ ff_put_vp8_epel4_v4_neon: 4-pixel-wide, vertical 4-tap sub-pixel filter.
@   r0 = dst (32-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #8] = my
@ Four rows per iteration (h must be a non-zero multiple of 4). Two row
@ windows, two rows apart, are packed into the low and high 32-bit lanes of
@ each d register so that the 8-wide macro handles both at once.
function ff_put_vp8_epel4_v4_neon, export=1
        push            {r4,lr}
        sub             r2,  r2,  r3            @ start one row above the block

        ldr             r4,  [sp, #16]          @ my
        movrel          lr,  subpel_filters-16  @ table is indexed from 1
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4   @ 16 bytes per filter row
        vld1.16         {q0},     [r4,:128]     @ q0 = selected filter taps
1:
        @ rows 0..4 -> all lanes (lane 1 is overwritten below)
        vld1.32         {d2[]},   [r2], r3
        vld1.32         {d3[]},   [r2], r3
        vld1.32         {d4[]},   [r2], r3
        vld1.32         {d5[]},   [r2], r3
        vld1.32         {d6[]},   [r2]
        sub             r2,  r2,  r3,  lsl #1   @ back to row 2
        @ rows 2..6 -> lane 1
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vld1.32         {d4[1]},  [r2], r3
        vld1.32         {d5[1]},  [r2], r3
        vld1.32         {d6[1]},  [r2]
        sub             r2,  r2,  r3,  lsl #1   @ net advance: four rows

        vp8_epel8_v4_y2 d2,  d3,  d2,  d3,  d4,  d5,  d6

        @ lane 0 holds output rows 0/1, lane 1 holds rows 2/3
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             1b

        pop             {r4,pc}
endfunc
||

1558 | |||

@ ff_put_vp8_epel4_h4v4_neon: 4-pixel-wide 2-D sub-pixel filter,
@ 4-tap horizontal pass followed by a 4-tap vertical pass.
@   r0 = dst (32-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack on entry: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ The horizontal pass writes h + 3 rows of 4 bytes into a 16-byte-aligned
@ scratch area of 44 bytes on the stack, so h must not exceed 8.
@ The vertical pass emits four rows per iteration (h must be a non-zero
@ multiple of 4).
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             r2,  r2,  r3            @ one row up
        sub             r2,  r2,  #1            @ one pixel left
        push            {r4,lr}

        @ first pass (horizontal):
        ldr             r4,  [sp, #12]          @ mx
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #8]           @ h
        add             r4,  lr,  r4,  lsl #4
        sub             sp,  sp,  #44+16        @ scratch + alignment slack
        vld1.16         {q0},     [r4,:128]
        add             lr,  sp,  #15
        add             r12, r12, #3            @ rows needed by the 4-tap vertical pass
        bic             lr,  lr,  #15           @ lr = scratch rounded up to 16 bytes
1:
        vld1.8          {d2},     [r2], r3      @ only 8 source bytes are read
        @ Only d2 has been loaded, so pass it for both source operands (as
        @ ff_put_vp8_epel4_h4_neon does) instead of the never-written d3.
        @ Just the low four result bytes are kept.
        vp8_epel8_h4    d2,  d2,  d2
        vst1.32         {d2[0]},  [lr,:32]!
        subs            r12, r12, #1
        bne             1b

        @ second pass (vertical):
        ldr             r4,  [sp, #44+16+16]    @ my
        movrel          lr,  subpel_filters-16
        ldr             r12, [sp, #44+16+8]     @ h
        add             r4,  lr,  r4,  lsl #4
        add             lr,  sp,  #15
        vld1.16         {q0},     [r4,:128]
        bic             lr,  lr,  #15           @ rewind to start of scratch
2:
        @ window A: rows 0..4
        vld1.8          {d2-d3},  [lr,:128]!    @ rows 0-3
        vld1.32         {d6[]},   [lr,:32]      @ row 4
        sub             lr,  lr,  #8            @ back to row 2
        @ window B: rows 2..6
        vld1.8          {d4-d5},  [lr]!         @ rows 2-5 (only 8-byte aligned)
        vld1.32         {d6[1]},  [lr,:32]      @ row 6
        sub             lr,  lr,  #8            @ net advance: 16 bytes = four rows
        @ interleave the two windows lane-wise for the 8-wide macro
        vtrn.32         q1,  q2
        vp8_epel8_v4_y2 d2,  d3,  d2,  d4,  d3,  d5,  d6
        vst1.32         {d2[0]},  [r0,:32], r1
        vst1.32         {d3[0]},  [r0,:32], r1
        vst1.32         {d2[1]},  [r0,:32], r1
        vst1.32         {d3[1]},  [r0,:32], r1
        subs            r12, r12, #4
        bne             2b

        add             sp,  sp,  #44+16
        pop             {r4,pc}
endfunc
||

1606 | |||

1607 | @ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit |
||

1608 | @ arithmetic can be used to apply filters |
||

@ Sub-pixel filter tap table: seven rows of eight 16-bit entries (16 bytes
@ per row; the last two entries of every row are zero padding).
@ The routines in this file address it as (subpel_filters - 16) + 16 * index,
@ so the first row below belongs to index 1 and the last to index 7.
@ NOTE(review): all entries are non-negative here; presumably the tap signs
@ are applied by the filter macros -- confirm against their definitions.
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0     @ index 1
        .short     2,  11, 108,  36,   8,   1,   0,   0     @ index 2
        .short     0,   9,  93,  50,   6,   0,   0,   0     @ index 3
        .short     3,  16,  77,  77,  16,   3,   0,   0     @ index 4
        .short     0,   6,  50,  93,   9,   0,   0,   0     @ index 5
        .short     1,   8,  36, 108,  11,   2,   0,   0     @ index 6
        .short     0,   1,  12, 123,   6,   0,   0,   0     @ index 7
endconst
||

1618 | |||

1619 | /* Bilinear MC */ |
||

1620 | |||

@ ff_put_vp8_bilin16_h_neon: 16-pixel-wide horizontal bilinear filter.
@   out[x] = (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3
@   r0 = dst (128-bit aligned stores), r1 = dst stride
@   r2 = src,                           r3 = src stride
@   stack: [sp] = h, [sp, #4] = mx
@ r3 is the 4th argument; it is taken to be the source stride as in the epel
@ routines of this file (NOTE(review): confirm against the C prototype).
@ The source pointer is stepped by r3 instead of the destination stride, so
@ the routine also works when the two strides differ; r3 is therefore left
@ untouched and mx is staged through r12 only.
@ Two rows per iteration; the loop exits once the row counter is <= 0.
@ 24 source bytes are read per row.
function ff_put_vp8_bilin16_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12                @ d0 = mx
        rsb             r12, r12, #8
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2            @ flags consumed by bgt below
        vld1.8          {d2-d4},  [r2], r3
        vext.8          q2,  q1,  q2,  #1       @ q2 = row shifted left by one pixel
        vmull.u8        q8,  d2,  d1
        vmlal.u8        q8,  d4,  d0
        vld1.8          {d18-d20},[r2], r3
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d5,  d0
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vrshrn.u16      d4,  q8,  #3            @ round and narrow back to 8 bits
        vrshrn.u16      d5,  q3,  #3
        vrshrn.u16      d6,  q11, #3
        vrshrn.u16      d7,  q12, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
||

1651 | |||

@ ff_put_vp8_bilin16_v_neon: 16-pixel-wide vertical bilinear filter.
@   out = (row[y] * (8 - my) + row[y + 1] * my + 4) >> 3
@   r0 = dst (128-bit aligned stores), r1 = dst stride
@   r2 = src,                           r3 = src stride
@   stack: [sp] = h, [sp, #8] = my
@ r3 is the 4th argument; it is taken to be the source stride as in the epel
@ routines of this file (NOTE(review): confirm against the C prototype).
@ The source pointer is stepped by r3 instead of the destination stride, so
@ the routine also works when the two strides differ; r3 is therefore left
@ untouched and my is staged through r12 only.
@ Two rows per iteration; the loop exits once the row counter is <= 0.
function ff_put_vp8_bilin16_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12                @ d0 = my
        rsb             r12, r12, #8
        vdup.8          d1,  r12                @ d1 = 8 - my
        ldr             r12, [sp]               @ h
        vld1.8          {q1},     [r2], r3      @ prime with the first row
1:
        subs            r12, r12, #2            @ flags consumed by bgt below
        vld1.8          {q2},     [r2], r3
        vmull.u8        q3,  d2,  d1
        vmlal.u8        q3,  d4,  d0
        vmull.u8        q8,  d3,  d1
        vmlal.u8        q8,  d5,  d0
        vld1.8          {q1},     [r2], r3      @ becomes the top row of the next pair
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d2,  d0
        vmull.u8        q10, d5,  d1
        vmlal.u8        q10, d3,  d0
        vrshrn.u16      d4,  q3,  #3            @ round and narrow back to 8 bits
        vrshrn.u16      d5,  q8,  #3
        vrshrn.u16      d6,  q9,  #3
        vrshrn.u16      d7,  q10, #3
        vst1.8          {q2},     [r0,:128], r1
        vst1.8          {q3},     [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
||

1681 | |||

@ ff_put_vp8_bilin16_hv_neon: 16-pixel-wide 2-D bilinear filter.
@ Each source row is first filtered horizontally with weights (8 - mx, mx)
@ and rounded to 8 bits; adjacent filtered rows are then blended with
@ weights (8 - my, my) and rounded again.
@   r0 = dst (128-bit aligned stores), r1 = dst stride
@   r2 = src,                           r3 = src stride
@   stack: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ r3 is the 4th argument; it is taken to be the source stride as in the epel
@ routines of this file (NOTE(review): confirm against the C prototype).
@ The source pointer is stepped by r3 instead of the destination stride, so
@ the routine also works when the two strides differ; r3 is therefore left
@ untouched and mx/my are staged through r12 only.
@ Two rows per iteration; the loop exits once the row counter is <= 0.
function ff_put_vp8_bilin16_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12                @ d0 = mx
        rsb             r12, r12, #8
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12                @ d2 = my
        rsb             r12, r12, #8
        vdup.8          d3,  r12                @ d3 = 8 - my
        ldr             r12, [sp]               @ h

        @ horizontally filter the first row into q2
        vld1.8          {d4-d6},  [r2], r3
        vext.8          q3,  q2,  q3,  #1
        vmull.u8        q8,  d4,  d1
        vmlal.u8        q8,  d6,  d0
        vmull.u8        q9,  d5,  d1
        vmlal.u8        q9,  d7,  d0
        vrshrn.u16      d4,  q8,  #3
        vrshrn.u16      d5,  q9,  #3
1:
        subs            r12, r12, #2            @ flags consumed by bgt below
        vld1.8          {d18-d20},[r2], r3
        vext.8          q10, q9,  q10, #1
        vmull.u8        q11, d18, d1
        vmlal.u8        q11, d20, d0
        vld1.8          {d26-d28},[r2], r3
        vmull.u8        q12, d19, d1
        vmlal.u8        q12, d21, d0
        vext.8          q14, q13, q14, #1
        vmull.u8        q8,  d26, d1
        vmlal.u8        q8,  d28, d0
        vmull.u8        q9,  d27, d1
        vmlal.u8        q9,  d29, d0
        vrshrn.u16      d6,  q11, #3            @ q3 = filtered middle row
        vrshrn.u16      d7,  q12, #3
        vmull.u8        q12, d4,  d3            @ blend previous row with middle row
        vmlal.u8        q12, d6,  d2
        vmull.u8        q15, d5,  d3
        vmlal.u8        q15, d7,  d2
        vrshrn.u16      d4,  q8,  #3            @ q2 = filtered last row (kept for next pass)
        vrshrn.u16      d5,  q9,  #3
        vmull.u8        q10, d6,  d3            @ blend middle row with last row
        vmlal.u8        q10, d4,  d2
        vmull.u8        q11, d7,  d3
        vmlal.u8        q11, d5,  d2
        vrshrn.u16      d24, q12, #3
        vrshrn.u16      d25, q15, #3
        vst1.8          {q12},    [r0,:128], r1
        vrshrn.u16      d20, q10, #3
        vrshrn.u16      d21, q11, #3
        vst1.8          {q10},    [r0,:128], r1
        bgt             1b

        bx              lr
endfunc
||

1737 | |||

@ ff_put_vp8_bilin8_h_neon: 8-pixel-wide horizontal bilinear filter.
@   out[x] = (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3
@   r0 = dst (64-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack: [sp] = h, [sp, #4] = mx
@ r3 is the 4th argument; it is taken to be the source stride as in the epel
@ routines of this file (NOTE(review): confirm against the C prototype).
@ The source pointer is stepped by r3 instead of the destination stride, so
@ the routine also works when the two strides differ; r3 is therefore left
@ untouched and mx is staged through r12 only.
@ Two rows per iteration; the loop exits once the row counter is <= 0.
@ 16 source bytes are read per row.
function ff_put_vp8_bilin8_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12                @ d0 = mx
        rsb             r12, r12, #8
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2            @ flags consumed by bgt below
        vld1.8          {q1},     [r2], r3
        vext.8          d3,  d2,  d3,  #1       @ d3 = row shifted left by one pixel
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d4,  q2,  #3            @ round and narrow back to 8 bits
        vrshrn.u16      d16, q8,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d16},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
||

1762 | |||

@ ff_put_vp8_bilin8_v_neon: 8-pixel-wide vertical bilinear filter.
@   out = (row[y] * (8 - my) + row[y + 1] * my + 4) >> 3
@   r0 = dst (64-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack: [sp] = h, [sp, #8] = my
@ r3 is the 4th argument; it is taken to be the source stride as in the epel
@ routines of this file (NOTE(review): confirm against the C prototype).
@ The source pointer is stepped by r3 instead of the destination stride, so
@ the routine also works when the two strides differ; r3 is therefore left
@ untouched and my is staged through r12 only.
@ Two rows per iteration; the loop exits once the row counter is <= 0.
function ff_put_vp8_bilin8_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12                @ d0 = my
        rsb             r12, r12, #8
        vdup.8          d1,  r12                @ d1 = 8 - my
        ldr             r12, [sp]               @ h
        vld1.8          {d2},     [r2], r3      @ prime with the first row
1:
        subs            r12, r12, #2            @ flags consumed by bgt below
        vld1.8          {d3},     [r2], r3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vld1.8          {d2},     [r2], r3      @ becomes the top row of the next pair
        vmull.u8        q3,  d3,  d1
        vmlal.u8        q3,  d2,  d0
        vrshrn.u16      d4,  q2,  #3            @ round and narrow back to 8 bits
        vrshrn.u16      d6,  q3,  #3
        vst1.8          {d4},     [r0,:64], r1
        vst1.8          {d6},     [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
||

1786 | |||

@ ff_put_vp8_bilin8_hv_neon: 8-pixel-wide 2-D bilinear filter.
@ Each source row is first filtered horizontally with weights (8 - mx, mx)
@ and rounded to 8 bits; adjacent filtered rows are then blended with
@ weights (8 - my, my) and rounded again.
@   r0 = dst (64-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ r3 is the 4th argument; it is taken to be the source stride as in the epel
@ routines of this file (NOTE(review): confirm against the C prototype).
@ The source pointer is stepped by r3 instead of the destination stride, so
@ the routine also works when the two strides differ; r3 is therefore left
@ untouched and mx/my are staged through r12 only.
@ Two rows per iteration; the loop exits once the row counter is <= 0.
function ff_put_vp8_bilin8_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12                @ d0 = mx
        rsb             r12, r12, #8
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12                @ d2 = my
        rsb             r12, r12, #8
        vdup.8          d3,  r12                @ d3 = 8 - my
        ldr             r12, [sp]               @ h

        @ horizontally filter the first row into d22
        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2            @ flags consumed by bgt below
        vld1.8          {q3},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vld1.8          {q2},     [r2], r3
        vext.8          d5,  d4,  d5,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d16, q8,  #3            @ d16 = filtered middle row
        vmull.u8        q10, d22, d3            @ blend previous row with middle row
        vmlal.u8        q10, d16, d2
        vrshrn.u16      d22, q9,  #3            @ d22 = filtered last row (kept for next pass)
        vmull.u8        q12, d16, d3            @ blend middle row with last row
        vmlal.u8        q12, d22, d2
        vrshrn.u16      d20, q10, #3
        vst1.8          {d20},    [r0,:64], r1
        vrshrn.u16      d23, q12, #3
        vst1.8          {d23},    [r0,:64], r1
        bgt             1b

        bx              lr
endfunc
||

1827 | |||

@ ff_put_vp8_bilin4_h_neon: 4-pixel-wide horizontal bilinear filter.
@   out[x] = (src[x] * (8 - mx) + src[x + 1] * mx + 4) >> 3
@   r0 = dst (32-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack: [sp] = h, [sp, #4] = mx
@ r3 is the 4th argument; it is taken to be the source stride as in the epel
@ routines of this file (NOTE(review): confirm against the C prototype).
@ The source pointer is stepped by r3 instead of the destination stride, so
@ the routine also works when the two strides differ; r3 is therefore left
@ untouched and mx is staged through r12 only.
@ Two rows per iteration; the loop exits once the row counter is <= 0.
@ 8 source bytes are read per row.
function ff_put_vp8_bilin4_h_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12                @ d0 = mx
        rsb             r12, r12, #8
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r12, [sp]               @ h
1:
        subs            r12, r12, #2            @ flags consumed by bgt below
        vld1.8          {d2},     [r2], r3
        vext.8          d3,  d2,  d3,  #1       @ low 4 bytes = row shifted by one pixel
        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d7,  #1
        vtrn.32         q1,  q3                 @ d2 = both rows, d3 = both shifted rows
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vrshrn.u16      d4,  q2,  #3            @ round and narrow back to 8 bits
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        bgt             1b

        bx              lr
endfunc
||

1850 | |||

@ ff_put_vp8_bilin4_v_neon: 4-pixel-wide vertical bilinear filter.
@   out = (row[y] * (8 - my) + row[y + 1] * my + 4) >> 3
@   r0 = dst (32-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack: [sp] = h, [sp, #8] = my
@ r3 is the 4th argument; it is taken to be the source stride as in the epel
@ routines of this file (NOTE(review): confirm against the C prototype).
@ The source pointer is stepped by r3 instead of the destination stride, so
@ the routine also works when the two strides differ; r3 is therefore left
@ untouched and my is staged through r12 only.
@ Two rows per iteration; the loop exits once the row counter is <= 0.
function ff_put_vp8_bilin4_v_neon, export=1
        ldr             r12, [sp, #8]           @ my
        vdup.8          d0,  r12                @ d0 = my
        rsb             r12, r12, #8
        vdup.8          d1,  r12                @ d1 = 8 - my
        ldr             r12, [sp]               @ h
        vld1.32         {d2[]},   [r2], r3      @ prime with the first row
1:
        @ d2 = {row y, row y+1}, d3 = {row y+1, row y+2}
        vld1.32         {d3[]},   [r2]
        vld1.32         {d2[1]},  [r2], r3
        vld1.32         {d3[1]},  [r2], r3
        vmull.u8        q2,  d2,  d1
        vmlal.u8        q2,  d3,  d0
        vtrn.32         d3,  d2                 @ move row y+2 into d2 lane 0 for the next pass
        vrshrn.u16      d4,  q2,  #3            @ round and narrow back to 8 bits
        vst1.32         {d4[0]},  [r0,:32], r1
        vst1.32         {d4[1]},  [r0,:32], r1
        subs            r12, r12, #2
        bgt             1b

        bx              lr
endfunc
||

1873 | |||

@ ff_put_vp8_bilin4_hv_neon: 4-pixel-wide 2-D bilinear filter.
@ Each source row is first filtered horizontally with weights (8 - mx, mx)
@ and rounded to 8 bits; adjacent filtered rows are then blended with
@ weights (8 - my, my) and rounded again.
@   r0 = dst (32-bit aligned stores), r1 = dst stride
@   r2 = src,                          r3 = src stride
@   stack: [sp] = h, [sp, #4] = mx, [sp, #8] = my
@ r3 is the 4th argument; it is taken to be the source stride as in the epel
@ routines of this file (NOTE(review): confirm against the C prototype).
@ The source pointer is stepped by r3 instead of the destination stride, so
@ the routine also works when the two strides differ; r3 is therefore left
@ untouched and mx/my are staged through r12 only.
@ Two rows per iteration; the loop exits once the row counter is <= 0.
function ff_put_vp8_bilin4_hv_neon, export=1
        ldr             r12, [sp, #4]           @ mx
        vdup.8          d0,  r12                @ d0 = mx
        rsb             r12, r12, #8
        vdup.8          d1,  r12                @ d1 = 8 - mx
        ldr             r12, [sp, #8]           @ my
        vdup.8          d2,  r12                @ d2 = my
        rsb             r12, r12, #8
        vdup.8          d3,  r12                @ d3 = 8 - my
        ldr             r12, [sp]               @ h

        @ horizontally filter the first row into d22
        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vmull.u8        q9,  d4,  d1
        vmlal.u8        q9,  d5,  d0
        vrshrn.u16      d22, q9,  #3
1:
        subs            r12, r12, #2            @ flags consumed by bgt below
        vld1.8          {d6},     [r2], r3
        vext.8          d7,  d6,  d6,  #1
        vld1.8          {d4},     [r2], r3
        vext.8          d5,  d4,  d4,  #1
        vtrn.32         q3,  q2                 @ d6 = both rows, d7 = both shifted rows
        vmull.u8        q8,  d6,  d1
        vmlal.u8        q8,  d7,  d0
        vrshrn.u16      d16, q8,  #3            @ d16 = the two new filtered rows
        vmull.u8        q10, d16, d2
        vtrn.32         d22, d16                @ d22 = {previous row, first new row}
        vmlal.u8        q10, d22, d3
        vrev64.32       d22, d16                @ keep the last filtered row for the next pass
        vrshrn.u16      d20, q10, #3
        vst1.32         {d20[0]}, [r0,:32], r1
        vst1.32         {d20[1]}, [r0,:32], r1
        bgt             1b

        bx              lr
endfunc