ffmpeg / libavcodec / armv4l / dsputil_neon_s.S @ 569f5a75
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon
        .text
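
/*
 * Each pixels* macro below implements one dsputil put/avg_pixels
 * function.  Arguments follow the usual dsputil layout:
 *   r0 = uint8_t *block         destination (:128/:64 aligned stores)
 *   r1 = const uint8_t *pixels  source, loaded without alignment hints
 *   r2 = int line_size          stride for both source and destination
 *   r3 = int h                  row count, a multiple of the unroll
 *
 * pixels16: straight 16xh copy, four rows per iteration.  With avg=1
 * the existing destination is blended in with vrhadd.u8 (rounding
 * halving add), giving the avg_pixels16 behaviour.
 */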
.macro pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
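
/*
 * pixels16_x2: horizontal half-pel interpolation.  Each output byte is
 * the average of a source byte and its right neighbour; vext.8 builds
 * the one-byte-shifted copy.  \vhadd selects rounding (vrhadd.u8) or
 * truncating (vhadd.u8, the no_rnd variant) averaging.
 */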
.macro pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
.endm
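
/*
 * pixels16_y2: vertical half-pel interpolation.  r1 and ip walk two
 * row streams one line apart with a double stride, so every loaded row
 * is reused as the second operand of the following average; two output
 * rows per iteration.
 */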
.macro pixels16_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], lr
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        pop             {pc}
.endm
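
/*
 * pixels16_xy2: 2D half-pel interpolation, (a+b+c+d+2)>>2 per pixel.
 * Sums of horizontally adjacent bytes are kept as 16-bit values in
 * q8-q11 and reused for the next output row; \vshrn narrows the >>2
 * back to bytes.  For no_rnd the constant 1 in q13 replaces the
 * rounding term, i.e. a truncating (a+b+c+d+1)>>2.
 */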
.macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0-d2},   [r1], lr
        vld1.64         {d4-d6},   [ip], lr
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], lr
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [ip], lr
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        pop             {pc}
.endm
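
/*
 * pixels8: straight 8xh copy, four rows per iteration; the 8-wide
 * analogue of pixels16 (no avg variant is instantiated below).
 */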
.macro pixels8
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
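
/*
 * pixels8_x2: 8-wide horizontal half-pel average.  Two rows are loaded,
 * shifted copies built with vext.8, then vswp pairs them so a single
 * q-sized \vhadd averages both rows at once.
 */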
.macro pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm
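
/*
 * pixels8_y2: 8-wide vertical half-pel average; the same two-stream
 * scheme as pixels16_y2, on d registers.
 */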
.macro pixels8_y2 vhadd=vrhadd.u8
        push            {lr}
        add             ip,  r1,  r2
        lsl             lr,  r2,  #1
        vld1.64         {d0}, [r1], lr
        vld1.64         {d1}, [ip], lr
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0}, [r1], lr
        \vhadd          d5,  d0,  d1
        vld1.64         {d1}, [ip], lr
        pld             [r1]
        pld             [ip]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        pop             {pc}
.endm
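
/*
 * pixels8_xy2: 8-wide 2D half-pel interpolation; the same running
 * row-sum scheme as pixels16_xy2, with the 16-bit sums held in q8/q9
 * and the no_rnd bias in q11.
 */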
.macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        push            {lr}
        lsl             lr,  r2,  #1
        add             ip,  r1,  r2
        vld1.64         {d0, d1},  [r1], lr
        vld1.64         {d2, d3},  [ip], lr
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [ip]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], lr
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [ip], lr
        vadd.u16        q10, q8,  q9
        pld             [ip]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        pop             {pc}
.endm
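
/*
 * pixfunc emits one exported function ff_\pfx\name\suf\()_neon whose
 * body is a single expansion of the matching macro above.  pixfunc2
 * emits the rounding version plus the _no_rnd version in one go.
 */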
.macro pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name \rnd_op \args
        .endfunc
.endm

.macro pixfunc2 pfx name args:vararg
        pixfunc \pfx \name
        pixfunc \pfx \name \args
.endm
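
/*
 * The h264 qpel mc00 case is a plain copy, so the function below has
 * no return of its own: it loads the fixed height into r3 and falls
 * through into the ff_put_pixels16_neon that pixfunc expands
 * immediately after it.
 */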
function ff_put_h264_qpel16_mc00_neon, export=1
        mov   r3, #16
        .endfunc

        pixfunc  put_ pixels16
        pixfunc2 put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
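
/* The same fall-through trick, for the averaging variant. */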
function ff_avg_h264_qpel16_mc00_neon, export=1
        mov   r3, #16
        .endfunc

        pixfunc  avg_ pixels16,, 1
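
/* And again for the 8x8 copy. */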
function ff_put_h264_qpel8_mc00_neon, export=1
        mov   r3, #8
        .endfunc

        pixfunc  put_ pixels8
        pixfunc2 put_ pixels8_x2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_y2,  _no_rnd, vhadd.u8
        pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1