ffmpeg / libavcodec / ppc / vc1dsp_altivec.c @ 2912e87a
/*
 * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized
 * Copyright (c) 2006 Konstantin Shishkov
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "libavcodec/vc1dsp.h"

#include "util_altivec.h"
#include "dsputil_altivec.h"

// main steps of 8x8 transform
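// Note: each coefficient of the VC-1 8-point transform is factored into
// shifts and adds (12, 16 and 6 for the even half; 16, 15, 9 and 4 for the
// odd half), so the pass needs no vector multiply. vec_rnd carries the
// rounding constant of the current pass (4 for rows, 64 for columns).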
#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
do { \
    t0 = vec_sl(vec_add(s0, s4), vec_2); \
    t0 = vec_add(vec_sl(t0, vec_1), t0); \
    t0 = vec_add(t0, vec_rnd); \
    t1 = vec_sl(vec_sub(s0, s4), vec_2); \
    t1 = vec_add(vec_sl(t1, vec_1), t1); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \
    t2 = vec_add(t2, vec_sl(s2, vec_4)); \
    t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \
    t3 = vec_sub(t3, vec_sl(s6, vec_4)); \
    t4 = vec_add(t0, t2); \
    t5 = vec_add(t1, t3); \
    t6 = vec_sub(t1, t3); \
    t7 = vec_sub(t0, t2); \
\
    t0 = vec_sl(vec_add(s1, s3), vec_4); \
    t0 = vec_add(t0, vec_sl(s5, vec_3)); \
    t0 = vec_add(t0, vec_sl(s7, vec_2)); \
    t0 = vec_add(t0, vec_sub(s5, s3)); \
\
    t1 = vec_sl(vec_sub(s1, s5), vec_4); \
    t1 = vec_sub(t1, vec_sl(s7, vec_3)); \
    t1 = vec_sub(t1, vec_sl(s3, vec_2)); \
    t1 = vec_sub(t1, vec_add(s1, s7)); \
\
    t2 = vec_sl(vec_sub(s7, s3), vec_4); \
    t2 = vec_add(t2, vec_sl(s1, vec_3)); \
    t2 = vec_add(t2, vec_sl(s5, vec_2)); \
    t2 = vec_add(t2, vec_sub(s1, s7)); \
\
    t3 = vec_sl(vec_sub(s5, s7), vec_4); \
    t3 = vec_sub(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s1, vec_2)); \
    t3 = vec_sub(t3, vec_add(s3, s5)); \
\
    s0 = vec_add(t4, t0); \
    s1 = vec_add(t5, t1); \
    s2 = vec_add(t6, t2); \
    s3 = vec_add(t7, t3); \
    s4 = vec_sub(t7, t3); \
    s5 = vec_sub(t6, t2); \
    s6 = vec_sub(t5, t1); \
    s7 = vec_sub(t4, t0); \
}while(0)

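/* Normalization after each pass: rows are scaled down by 8 (>> 3, with the
 * rounding constant 4 already folded in by STEP8), columns by 128 (>> 7, with
 * rounding constant 64). SHIFT_VERT8 adds an extra 1 to rows 4-7 before the
 * shift, which matches the per-row rounding of the VC-1 column transform. */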
#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3); \
    s4 = vec_sra(s4, vec_3); \
    s5 = vec_sra(s5, vec_3); \
    s6 = vec_sra(s6, vec_3); \
    s7 = vec_sra(s7, vec_3); \
}while(0)

#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \
do { \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7); \
    s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \
    s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \
    s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \
    s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \
}while(0)

/* main steps of 4x4 transform */
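/* 4-point counterpart of STEP8: the coefficients 17, 22 and 10 of the VC-1
 * 4-point transform are again built from shifts and adds. SHIFT_HOR4 and
 * SHIFT_VERT4 below apply the same >> 3 (row) and >> 7 (column)
 * normalization as their 8-point versions. */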
#define STEP4(s0, s1, s2, s3, vec_rnd) \
do { \
    t1 = vec_add(vec_sl(s0, vec_4), s0); \
    t1 = vec_add(t1, vec_rnd); \
    t2 = vec_add(vec_sl(s2, vec_4), s2); \
    t0 = vec_add(t1, t2); \
    t1 = vec_sub(t1, t2); \
    t3 = vec_sl(vec_sub(s3, s1), vec_1); \
    t3 = vec_add(t3, vec_sl(t3, vec_2)); \
    t2 = vec_add(t3, vec_sl(s1, vec_5)); \
    t3 = vec_add(t3, vec_sl(s3, vec_3)); \
    t3 = vec_add(t3, vec_sl(s3, vec_2)); \
    s0 = vec_add(t0, t2); \
    s1 = vec_sub(t1, t3); \
    s2 = vec_add(t1, t3); \
    s3 = vec_sub(t0, t2); \
}while (0)

#define SHIFT_HOR4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_3); \
    s1 = vec_sra(s1, vec_3); \
    s2 = vec_sra(s2, vec_3); \
    s3 = vec_sra(s3, vec_3);

#define SHIFT_VERT4(s0, s1, s2, s3) \
    s0 = vec_sra(s0, vec_7); \
    s1 = vec_sra(s1, vec_7); \
    s2 = vec_sra(s2, vec_7); \
    s3 = vec_sra(s3, vec_7);

/** Do inverse transform on 8x8 block
 */
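/* sign/rangered select the post-transform handling for WMV3 range reduction:
 * when rangered is set the result is doubled, and when sign is additionally
 * clear a bias of 64 is subtracted first (see the tail of the function). */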
static void vc1_inv_trans_8x8_altivec(DCTELEM block[64],
                                      int sign, int rangered)
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector signed int vec_1s = vec_splat_s32(1);
    const vector unsigned int vec_1 = vec_splat_u32(1);
    const vector unsigned short rangered_shift = vec_splat_u16(1);
    const vector signed short signed_bias = vec_sl(vec_splat_s16(4),
                                                   vec_splat_u16(4));

    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64);
    SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64);
    SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);

    if (rangered) {
        if (!sign) {
            src0 = vec_sub(src0, signed_bias);
            src1 = vec_sub(src1, signed_bias);
            src2 = vec_sub(src2, signed_bias);
            src3 = vec_sub(src3, signed_bias);
            src4 = vec_sub(src4, signed_bias);
            src5 = vec_sub(src5, signed_bias);
            src6 = vec_sub(src6, signed_bias);
            src7 = vec_sub(src7, signed_bias);
        }
        src0 = vec_sl(src0, rangered_shift);
        src1 = vec_sl(src1, rangered_shift);
        src2 = vec_sl(src2, rangered_shift);
        src3 = vec_sl(src3, rangered_shift);
        src4 = vec_sl(src4, rangered_shift);
        src5 = vec_sl(src5, rangered_shift);
        src6 = vec_sl(src6, rangered_shift);
        src7 = vec_sl(src7, rangered_shift);
    }

    vec_st(src0,  0, block);
    vec_st(src1, 16, block);
    vec_st(src2, 32, block);
    vec_st(src3, 48, block);
    vec_st(src4, 64, block);
    vec_st(src5, 80, block);
    vec_st(src6, 96, block);
    vec_st(src7,112, block);
}

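/* Thin wrappers used as VC1DSPContext entries: the transform works in place
 * on the coefficient block, then the clamped copy/add to the destination is
 * delegated to the scalar dsputil helpers. The *_rangered variants are
 * registered as the [1] entries of the put tables in ff_vc1dsp_init_altivec()
 * below. */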
static void vc1_inv_trans_8x8_add_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
    vc1_inv_trans_8x8_altivec(b, 0, 0);
    ff_add_pixels_clamped_c(b, dest, stride);
}

static void vc1_inv_trans_8x8_put_signed_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
    vc1_inv_trans_8x8_altivec(b, 1, 0);
    ff_put_signed_pixels_clamped_c(b, dest, stride);
}

static void vc1_inv_trans_8x8_put_signed_rangered_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
    vc1_inv_trans_8x8_altivec(b, 1, 1);
    ff_put_signed_pixels_clamped_c(b, dest, stride);
}

static void vc1_inv_trans_8x8_put_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
    vc1_inv_trans_8x8_altivec(b, 0, 0);
    ff_put_pixels_clamped_c(b, dest, stride);
}

static void vc1_inv_trans_8x8_put_rangered_altivec(uint8_t *dest, int stride, DCTELEM *b)
{
    vc1_inv_trans_8x8_altivec(b, 0, 1);
    ff_put_pixels_clamped_c(b, dest, stride);
}

/** Do inverse transform on 8x4 part of block
 */
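/* Rows use the 8-point transform (STEP8/SHIFT_HOR8), columns the 4-point one
 * (STEP4/SHIFT_VERT4). Unlike the 8x8 case the result is not written back to
 * the coefficient block; the ADD macro below merges it into the destination
 * with an unaligned load, a saturated add and two 4-byte vec_ste stores. */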
static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
{
    vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
    vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
    vector signed int s8, s9, sA, sB, sC, sD, sE, sF;
    vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
    const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
    const vector unsigned int vec_7 = vec_splat_u32(7);
    const vector unsigned int vec_5 = vec_splat_u32(5);
    const vector unsigned int vec_4 = vec_splat_u32(4);
    const vector signed int vec_4s = vec_splat_s32(4);
    const vector unsigned int vec_3 = vec_splat_u32(3);
    const vector unsigned int vec_2 = vec_splat_u32(2);
    const vector unsigned int vec_1 = vec_splat_u32(1);
    vector unsigned char tmp;
    vector signed short tmp2, tmp3;
    vector unsigned char perm0, perm1, p0, p1, p;

    src0 = vec_ld(  0, block);
    src1 = vec_ld( 16, block);
    src2 = vec_ld( 32, block);
    src3 = vec_ld( 48, block);
    src4 = vec_ld( 64, block);
    src5 = vec_ld( 80, block);
    src6 = vec_ld( 96, block);
    src7 = vec_ld(112, block);

    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
    s0 = vec_unpackl(src0);
    s1 = vec_unpackl(src1);
    s2 = vec_unpackl(src2);
    s3 = vec_unpackl(src3);
    s4 = vec_unpackl(src4);
    s5 = vec_unpackl(src5);
    s6 = vec_unpackl(src6);
    s7 = vec_unpackl(src7);
    s8 = vec_unpackh(src0);
    s9 = vec_unpackh(src1);
    sA = vec_unpackh(src2);
    sB = vec_unpackh(src3);
    sC = vec_unpackh(src4);
    sD = vec_unpackh(src5);
    sE = vec_unpackh(src6);
    sF = vec_unpackh(src7);
    STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s);
    SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7);
    STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s);
    SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF);
    src0 = vec_pack(s8, s0);
    src1 = vec_pack(s9, s1);
    src2 = vec_pack(sA, s2);
    src3 = vec_pack(sB, s3);
    src4 = vec_pack(sC, s4);
    src5 = vec_pack(sD, s5);
    src6 = vec_pack(sE, s6);
    src7 = vec_pack(sF, s7);
    TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);

    s0 = vec_unpackh(src0);
    s1 = vec_unpackh(src1);
    s2 = vec_unpackh(src2);
    s3 = vec_unpackh(src3);
    s8 = vec_unpackl(src0);
    s9 = vec_unpackl(src1);
    sA = vec_unpackl(src2);
    sB = vec_unpackl(src3);
    STEP4(s0, s1, s2, s3, vec_64);
    SHIFT_VERT4(s0, s1, s2, s3);
    STEP4(s8, s9, sA, sB, vec_64);
    SHIFT_VERT4(s8, s9, sA, sB);
    src0 = vec_pack(s0, s8);
    src1 = vec_pack(s1, s9);
    src2 = vec_pack(s2, sA);
    src3 = vec_pack(s3, sB);

    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

#define ADD(dest,src,perm) \
    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
    tmp = vec_ld (0, dest); \
    tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \
    tmp3 = vec_adds (tmp2, src); \
    tmp = vec_packsu (tmp3, tmp3); \
    vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \
    vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest);

    ADD (dest, src0, perm0) dest += stride;
    ADD (dest, src1, perm1) dest += stride;
    ADD (dest, src2, perm0) dest += stride;
    ADD (dest, src3, perm1)
}

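/* No-rounding chroma MC: the shared H.264 AltiVec template is instantiated
 * twice, once with the PUT and once with the AVG store operation, to produce
 * the VC-1 8x8 chroma motion compensation functions. */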
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)

#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC
#define PREFIX_no_rnd_vc1_chroma_mc8_altivec put_no_rnd_vc1_chroma_mc8_altivec
#include "h264_template_altivec.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec

#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC
#define PREFIX_no_rnd_vc1_chroma_mc8_altivec avg_no_rnd_vc1_chroma_mc8_altivec
#include "h264_template_altivec.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_no_rnd_vc1_chroma_mc8_altivec

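/* Entry point called from the generic VC-1 DSP init: it checks for AltiVec at
 * runtime and, if present, overrides the C function pointers installed by the
 * generic code. A minimal sketch of the expected call site (the exact guard
 * in vc1dsp.c may differ):
 *
 *     ff_vc1dsp_init(&v->vc1dsp);              // install C versions
 *     if (HAVE_ALTIVEC)
 *         ff_vc1dsp_init_altivec(&v->vc1dsp);  // override with AltiVec versions
 */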
void ff_vc1dsp_init_altivec(VC1DSPContext* dsp)
{
    if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
        return;

    dsp->vc1_inv_trans_8x8_add = vc1_inv_trans_8x8_add_altivec;
    dsp->vc1_inv_trans_8x8_put_signed[0] = vc1_inv_trans_8x8_put_signed_altivec;
    dsp->vc1_inv_trans_8x8_put_signed[1] = vc1_inv_trans_8x8_put_signed_rangered_altivec;
    dsp->vc1_inv_trans_8x8_put[0] = vc1_inv_trans_8x8_put_altivec;
    dsp->vc1_inv_trans_8x8_put[1] = vc1_inv_trans_8x8_put_rangered_altivec;
    dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec;
    dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = put_no_rnd_vc1_chroma_mc8_altivec;
    dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = avg_no_rnd_vc1_chroma_mc8_altivec;
}