Revision 6c618a26
libavcodec/ppc/fdct_altivec.c  

214  214  
215  215 
/* setup constants {{{ */ 
216  216 
/* mzero = 0.0 */ 
217 
vu32(mzero) = vec_splat_u32(1);


218 
vu32(mzero) = vec_sl(vu32(mzero), vu32(mzero));


217 
mzero = ((vector float)vec_splat_u32(1));


218 
mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero)));


219  219 
cp = fdctconsts; 
220  220 
cnsts0 = vec_ld(0, cp); cp++; 
221  221 
cnsts1 = vec_ld(0, cp); cp++; 
...  ...  
227  227 
/*
 * MERGE_S16(hl, a, b): pastes hl onto "vec_merge", so hl = h calls
 * vec_mergeh and hl = l calls vec_mergel, on vs16(a) and vs16(b).
 * vs16() is defined outside this view; presumably it reinterprets its
 * argument as vector signed short -- confirm against its definition.
 * The macro is local to the load/interleave section and is #undef'd
 * once that section is finished.
 */
#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b)) 
228  228  
229  229 
bp = (vector signed short*)block; 
230 
vs16(b00) = vec_ld(0, bp);


231 
vs16(b40) = vec_ld(16*4, bp);


232 
vs16(b01) = MERGE_S16(h, b00, b40);


233 
vs16(b11) = MERGE_S16(l, b00, b40);


230 
b00 = ((vector float)vec_ld(0, bp));


231 
b40 = ((vector float)vec_ld(16*4, bp));


232 
b01 = ((vector float)MERGE_S16(h, b00, b40));


233 
b11 = ((vector float)MERGE_S16(l, b00, b40));


234  234 
bp++; 
235 
vs16(b10) = vec_ld(0, bp);


236 
vs16(b50) = vec_ld(16*4, bp);


237 
vs16(b21) = MERGE_S16(h, b10, b50);


238 
vs16(b31) = MERGE_S16(l, b10, b50);


235 
b10 = ((vector float)vec_ld(0, bp));


236 
b50 = ((vector float)vec_ld(16*4, bp));


237 
b21 = ((vector float)MERGE_S16(h, b10, b50));


238 
b31 = ((vector float)MERGE_S16(l, b10, b50));


239  239 
bp++; 
240 
vs16(b20) = vec_ld(0, bp);


241 
vs16(b60) = vec_ld(16*4, bp);


242 
vs16(b41) = MERGE_S16(h, b20, b60);


243 
vs16(b51) = MERGE_S16(l, b20, b60);


240 
b20 = ((vector float)vec_ld(0, bp));


241 
b60 = ((vector float)vec_ld(16*4, bp));


242 
b41 = ((vector float)MERGE_S16(h, b20, b60));


243 
b51 = ((vector float)MERGE_S16(l, b20, b60));


244  244 
bp++; 
245 
vs16(b30) = vec_ld(0, bp);


246 
vs16(b70) = vec_ld(16*4, bp);


247 
vs16(b61) = MERGE_S16(h, b30, b70);


248 
vs16(b71) = MERGE_S16(l, b30, b70);


249  
250 
vs16(x0) = MERGE_S16(h, b01, b41);


251 
vs16(x1) = MERGE_S16(l, b01, b41);


252 
vs16(x2) = MERGE_S16(h, b11, b51);


253 
vs16(x3) = MERGE_S16(l, b11, b51);


254 
vs16(x4) = MERGE_S16(h, b21, b61);


255 
vs16(x5) = MERGE_S16(l, b21, b61);


256 
vs16(x6) = MERGE_S16(h, b31, b71);


257 
vs16(x7) = MERGE_S16(l, b31, b71);


258  
259 
vs16(b00) = MERGE_S16(h, x0, x4);


260 
vs16(b10) = MERGE_S16(l, x0, x4);


261 
vs16(b20) = MERGE_S16(h, x1, x5);


262 
vs16(b30) = MERGE_S16(l, x1, x5);


263 
vs16(b40) = MERGE_S16(h, x2, x6);


264 
vs16(b50) = MERGE_S16(l, x2, x6);


265 
vs16(b60) = MERGE_S16(h, x3, x7);


266 
vs16(b70) = MERGE_S16(l, x3, x7);


245 
b30 = ((vector float)vec_ld(0, bp));


246 
b70 = ((vector float)vec_ld(16*4, bp));


247 
b61 = ((vector float)MERGE_S16(h, b30, b70));


248 
b71 = ((vector float)MERGE_S16(l, b30, b70));


249  
250 
x0 = ((vector float)MERGE_S16(h, b01, b41));


251 
x1 = ((vector float)MERGE_S16(l, b01, b41));


252 
x2 = ((vector float)MERGE_S16(h, b11, b51));


253 
x3 = ((vector float)MERGE_S16(l, b11, b51));


254 
x4 = ((vector float)MERGE_S16(h, b21, b61));


255 
x5 = ((vector float)MERGE_S16(l, b21, b61));


256 
x6 = ((vector float)MERGE_S16(h, b31, b71));


257 
x7 = ((vector float)MERGE_S16(l, b31, b71));


258  
259 
b00 = ((vector float)MERGE_S16(h, x0, x4));


260 
b10 = ((vector float)MERGE_S16(l, x0, x4));


261 
b20 = ((vector float)MERGE_S16(h, x1, x5));


262 
b30 = ((vector float)MERGE_S16(l, x1, x5));


263 
b40 = ((vector float)MERGE_S16(h, x2, x6));


264 
b50 = ((vector float)MERGE_S16(l, x2, x6));


265 
b60 = ((vector float)MERGE_S16(h, x3, x7));


266 
b70 = ((vector float)MERGE_S16(l, x3, x7));


267  267  
268  268 
#undef MERGE_S16 
269  269 
/* }}} */ 
...  ...  
275  275 
*/ 
276  276 
#if 1 
277  277 
/* fdct rows {{{ */ 
278 
vs16(x0) = vec_add(vs16(b00), vs16(b70));


279 
vs16(x7) = vec_sub(vs16(b00), vs16(b70));


280 
vs16(x1) = vec_add(vs16(b10), vs16(b60));


281 
vs16(x6) = vec_sub(vs16(b10), vs16(b60));


282 
vs16(x2) = vec_add(vs16(b20), vs16(b50));


283 
vs16(x5) = vec_sub(vs16(b20), vs16(b50));


284 
vs16(x3) = vec_add(vs16(b30), vs16(b40));


285 
vs16(x4) = vec_sub(vs16(b30), vs16(b40));


278 
x0 = ((vector float)vec_add(vs16(b00), vs16(b70)));


279 
x7 = ((vector float)vec_sub(vs16(b00), vs16(b70)));


280 
x1 = ((vector float)vec_add(vs16(b10), vs16(b60)));


281 
x6 = ((vector float)vec_sub(vs16(b10), vs16(b60)));


282 
x2 = ((vector float)vec_add(vs16(b20), vs16(b50)));


283 
x5 = ((vector float)vec_sub(vs16(b20), vs16(b50)));


284 
x3 = ((vector float)vec_add(vs16(b30), vs16(b40)));


285 
x4 = ((vector float)vec_sub(vs16(b30), vs16(b40)));


286  286  
287 
vs16(b70) = vec_add(vs16(x0), vs16(x3));


288 
vs16(b10) = vec_add(vs16(x1), vs16(x2));


287 
b70 = ((vector float)vec_add(vs16(x0), vs16(x3)));


288 
b10 = ((vector float)vec_add(vs16(x1), vs16(x2)));


289  289  
290 
vs16(b00) = vec_add(vs16(b70), vs16(b10));


291 
vs16(b40) = vec_sub(vs16(b70), vs16(b10));


290 
b00 = ((vector float)vec_add(vs16(b70), vs16(b10)));


291 
b40 = ((vector float)vec_sub(vs16(b70), vs16(b10)));


292  292  
293  293 
/*
 * CTF0(n): splits b<n>0 into two vectors and converts both to float.
 *   b<n>1 <- vec_unpackl(vs16(b<n>0))
 *   b<n>0 <- vec_unpackh(vs16(b<n>0))
 * then each result goes through vec_ctf(vs32(...), 0), i.e. a scale
 * argument of 0.
 * b<n>1 is written first because the vec_unpackh line overwrites
 * b<n>0, which both unpack calls read.
 * The unpack statements appear twice below: the vs32(...) = ... form is
 * the removed side of this diff, the ((vector float)...) form is the
 * added side.
 * vs16()/vs32() are defined outside this view; presumably vector signed
 * short / vector signed int casts -- confirm.
 */
#define CTF0(n) \ 
294 
vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \


295 
vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \


294 
b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \


295 
b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \


296  296 
b##n##1 = vec_ctf(vs32(b##n##1), 0); \ 
297  297 
b##n##0 = vec_ctf(vs32(b##n##0), 0); 
298  298  
299  299 
CTF0(0); 
300  300 
CTF0(4); 
301  301  
302 
vs16(b20) = vec_sub(vs16(x0), vs16(x3));


303 
vs16(b60) = vec_sub(vs16(x1), vs16(x2));


302 
b20 = ((vector float)vec_sub(vs16(x0), vs16(x3)));


303 
b60 = ((vector float)vec_sub(vs16(x1), vs16(x2)));


304  304  
305  305 
CTF0(2); 
306  306 
CTF0(6); 
...  ...  
321  321 
b61 = vec_madd(cnst, b61, x1); 
322  322  
323  323 
#define CTFX(x,b) \ 
324 
vs32(b##0) = vec_unpackh(vs16(x)); \


325 
vs32(b##1) = vec_unpackl(vs16(x)); \


324 
b##0 = ((vector float)vec_unpackh(vs16(x))); \


325 
b##1 = ((vector float)vec_unpackl(vs16(x))); \


326  326 
b##0 = vec_ctf(vs32(b##0), 0); \ 
327  327 
b##1 = vec_ctf(vs32(b##1), 0); \ 
328  328  
...  ...  
473  473 
/*
 * CTS(n): converts the float pair b<n>0 / b<n>1 back and stores it.
 *   1. vec_round() on both vectors.
 *   2. vec_cts(..., 0) on both vectors (scale argument 0).
 *   3. vec_pack(vs32(b<n>0), vs32(b<n>1)) combines the two results
 *      into b<n>0.
 *   4. vec_st(vs16(b<n>0), 0, bp) stores that vector at offset 0 from
 *      bp; the macro does not advance bp itself.
 * Steps 2 and 3 appear twice below: the vs32(...)/vs16(...) = ... form
 * is the removed side of this diff, the ((vector float)...) form is the
 * added side.
 * vs16()/vs32() are defined outside this view; presumably vector signed
 * short / vector signed int casts -- confirm.
 */
#define CTS(n) \ 
474  474 
b##n##0 = vec_round(b##n##0); \ 
475  475 
b##n##1 = vec_round(b##n##1); \ 
476 
vs32(b##n##0) = vec_cts(b##n##0, 0); \


477 
vs32(b##n##1) = vec_cts(b##n##1, 0); \


478 
vs16(b##n##0) = vec_pack(vs32(b##n##0), vs32(b##n##1)); \


476 
b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \


477 
b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \


478 
b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \


479  479 
vec_st(vs16(b##n##0), 0, bp); 
480  480  
481  481 
bp = (vector signed short*)block; 
Also available in: Unified diff