ffmpeg / libavcodec / ppc / h264_template_altivec.c @ ed040f35

/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif

/* this code assumes that stride % 16 == 0 */

#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

#define noop(a) a
#define add28(a) vec_add(v28ss, a)

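/* H.264 8x8 chroma MC: bilinear interpolation
 *   dst[i] = (A*a + B*b + C*c + D*d + 32) >> 6
 * with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y.  When x == 0 or
 * y == 0 the filter degenerates to two taps (vE = vB + vC) and the
 * CHROMA_MC8_ALTIVEC_CORE_SIMPLE path is used instead. */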
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif

/* this code assumes that stride % 16 == 0 */
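/* VC-1 "no rounding" variant of the 8x8 chroma MC above: same A/B/C/D
 * weights, but the rounding constant is 28 (added afterwards via add28)
 * instead of 32, still followed by a >> 6. */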
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE

/* this code assumes stride % 16 == 0 */
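/* Horizontal 6-tap luma half-pel filter (1, -5, 20, 20, -5, 1) over a
 * 16x16 block:
 *   dst = clip((srcM2 - 5*srcM1 + 20*srcP0 + 20*srcP1 - 5*srcP2 + srcP3 + 16) >> 5)
 * The switch on the source alignment only issues a third load (srcR3)
 * when the rightmost taps spill past the second 16-byte vector. */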
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
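/* Vertical version of the 6-tap filter above: the same
 * (1, -5, 20, 20, -5, 1) kernel is applied down each column, with the six
 * source rows kept in registers and shifted down by one row per iteration. */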
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
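/* Combined horizontal + vertical 6-tap filtering: the first loop runs the
 * horizontal filter over 21 source rows and stores the unrounded 16-bit
 * intermediates in tmp; the second loop applies the vertical filter to tmp
 * in 32-bit precision, adding the 512 bias and shifting right by 10. */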
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif