/*


* Copyright (c) 2002 Brian Foley

* Copyright (c) 2002 Dieter Shirley

* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>

*

* This file is part of FFmpeg.

*

* FFmpeg is free software; you can redistribute it and/or

* modify it under the terms of the GNU Lesser General Public

* License as published by the Free Software Foundation; either

* version 2.1 of the License, or (at your option) any later version.

*

* FFmpeg is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

* Lesser General Public License for more details.

*

* You should have received a copy of the GNU Lesser General Public

* License along with FFmpeg; if not, write to the Free Software

* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

*/

#include "dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_ppc.h"
#include "util_altivec.h"

int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

DECLARE_ALIGNED_16(int, s);

const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
vector unsigned char *tv; 
vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
40 
s = 0;

sad = (vector unsigned int)vec_splat_u32(0); 
for(i=0;i<h;i++) { 
/*

Read unaligned pixels into our vectors. The vectors are as follows:

pix1v: pix1[0]pix1[15]

pix2v: pix2[0]pix2[15] pix2iv: pix2[1]pix2[16]

*/

tv = (vector unsigned char *) pix1; 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
51 
tv = (vector unsigned char *) &pix2[0]; 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
54 
tv = (vector unsigned char *) &pix2[1]; 
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); 
57 
/* Calculate the average vector */

avgv = vec_avg(pix2v, pix2iv); 
59  
/* Calculate a sum of abs differences vector */

t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
62  
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
65  
pix1 += line_size; 
pix2 += line_size; 
} 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

74 
return s;

} 
77 
int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

DECLARE_ALIGNED_16(int, s);

const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
vector unsigned char *tv; 
vector unsigned char pix1v, pix2v, pix3v, avgv, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
uint8_t *pix3 = pix2 + line_size; 
88 
s = 0;

sad = (vector unsigned int)vec_splat_u32(0); 
91 
/*

Due to the fact that pix3 = pix2 + line_size, the pix3 of one

iteration becomes pix2 in the next iteration. We can use this

fact to avoid a potentially expensive unaligned read, each

time around the loop.

Read unaligned pixels into our vectors. The vectors are as follows:

pix2v: pix2[0]pix2[15]

Split the pixel vectors into shorts

*/

tv = (vector unsigned char *) &pix2[0]; 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
103 
for(i=0;i<h;i++) { 
/*

Read unaligned pixels into our vectors. The vectors are as follows:

pix1v: pix1[0]pix1[15]

pix3v: pix3[0]pix3[15]

*/

tv = (vector unsigned char *) pix1; 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
112 
tv = (vector unsigned char *) &pix3[0]; 
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); 
115 
/* Calculate the average vector */

avgv = vec_avg(pix2v, pix3v); 
117  
/* Calculate a sum of abs differences vector */

t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
121 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
124 
pix1 += line_size; 
pix2v = pix3v; 
pix3 += line_size; 
128 
} 
130 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

return s;

} 
137 
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

DECLARE_ALIGNED_16(int, s);

uint8_t *pix3 = pix2 + line_size; 
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); 
vector unsigned char *tv, avgv, t5; 
vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; 
vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; 
vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; 
vector unsigned short avghv, avglv; 
vector unsigned short t1, t2, t3, t4; 
vector unsigned int sad; 
vector signed int sumdiffs; 
153 
sad = (vector unsigned int)vec_splat_u32(0); 
155 
s = 0;

157 
/*

Due to the fact that pix3 = pix2 + line_size, the pix3 of one

iteration becomes pix2 in the next iteration. We can use this

fact to avoid a potentially expensive unaligned read, as well

as some splitting, and vector addition each time around the loop.

Read unaligned pixels into our vectors. The vectors are as follows:

pix2v: pix2[0]pix2[15] pix2iv: pix2[1]pix2[16]

Split the pixel vectors into shorts

*/

tv = (vector unsigned char *) &pix2[0]; 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
169 
tv = (vector unsigned char *) &pix2[1]; 
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); 
172 
pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); 
pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); 
pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); 
pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); 
t1 = vec_add(pix2hv, pix2ihv); 
t2 = vec_add(pix2lv, pix2ilv); 
179 
for(i=0;i<h;i++) { 
/*

Read unaligned pixels into our vectors. The vectors are as follows:

pix1v: pix1[0]pix1[15]

pix3v: pix3[0]pix3[15] pix3iv: pix3[1]pix3[16]

*/

tv = (vector unsigned char *) pix1; 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
188 
tv = (vector unsigned char *) &pix3[0]; 
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); 
191 
tv = (vector unsigned char *) &pix3[1]; 
pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); 
194 
/*

Note that AltiVec does have vec_avg, but this works on vector pairs

and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding

would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.

Instead, we have to split the pixel vectors into vectors of shorts,

and do the averaging by hand.

*/

202 
/* Split the pixel vectors into shorts */

pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); 
pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); 
pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); 
pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); 
208 
/* Do the averaging on them */

t3 = vec_add(pix3hv, pix3ihv); 
t4 = vec_add(pix3lv, pix3ilv); 
212 
avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); 
avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); 
215 
/* Pack the shorts back into a result */

avgv = vec_pack(avghv, avglv); 
218 
/* Calculate a sum of abs differences vector */

t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
221 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
224 
pix1 += line_size; 
pix3 += line_size; 
/* Transfer the calculated values for pix3 into pix2 */

t1 = t3; 
t2 = t4; 
} 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

235 
return s;

} 
238 
int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

DECLARE_ALIGNED_16(int, s);

const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm1, perm2, *pix1v, *pix2v; 
vector unsigned char t1, t2, t3,t4, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
248 
sad = (vector unsigned int)vec_splat_u32(0); 
249  
251 
for(i=0;i<h;i++) { 
/* Read potentially unaligned pixels into t1 and t2 */

perm1 = vec_lvsl(0, pix1);

pix1v = (vector unsigned char *) pix1; 
perm2 = vec_lvsl(0, pix2);

pix2v = (vector unsigned char *) pix2; 
t1 = vec_perm(pix1v[0], pix1v[1], perm1); 
t2 = vec_perm(pix2v[0], pix2v[1], perm2); 
260 
/* Calculate a sum of abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
265 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
268 
pix1 += line_size; 
pix2 += line_size; 
} 
272 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

277 
return s;

} 
280 
int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

DECLARE_ALIGNED_16(int, s);

const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 
vector unsigned char t1, t2, t3,t4, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
290 
sad = (vector unsigned int)vec_splat_u32(0); 
292 
permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); 
294 
for(i=0;i<h;i++) { 
/* Read potentially unaligned pixels into t1 and t2

Since we're reading 16 pixels, and actually only want 8,

mask out the last 8 pixels. The 0s don't change the sum. */

perm1 = vec_lvsl(0, pix1);

pix1v = (vector unsigned char *) pix1; 
perm2 = vec_lvsl(0, pix2);

pix2v = (vector unsigned char *) pix2; 
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); 
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); 
305 
/* Calculate a sum of abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
310 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
313 
pix1 += line_size; 
pix2 += line_size; 
} 
317 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

322 
return s;

} 
325 
int pix_norm1_altivec(uint8_t *pix, int line_size) 
{ 
int i;

DECLARE_ALIGNED_16(int, s);

const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
vector unsigned char *tv; 
vector unsigned char pixv; 
vector unsigned int sv; 
vector signed int sum; 
335 
sv = (vector unsigned int)vec_splat_u32(0); 
337 
s = 0;

for (i = 0; i < 16; i++) { 
/* Read in the potentially unaligned pixels */

tv = (vector unsigned char *) pix; 
pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); 
343 
/* Square the values, and add them to our sum */

sv = vec_msum(pixv, pixv, sv); 
346 
pix += line_size; 
} 
/* Sum up the four partial sums, and put the result into s */

sum = vec_sums((vector signed int) sv, (vector signed int) zero); 
sum = vec_splat(sum, 3);

vec_ste(sum, 0, &s);

353 
return s;

} 
356 
/**

* Sum of Squared Errors for a 8x8 block.

* AltiVecenhanced.

* It's the sad8_altivec code above w/ squaring added.

*/

int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
362 
{ 
int i;

DECLARE_ALIGNED_16(int, s);

const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 
vector unsigned char t1, t2, t3,t4, t5; 
vector unsigned int sum; 
vector signed int sumsqr; 
371 
sum = (vector unsigned int)vec_splat_u32(0); 
373 
permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); 
374  
376 
for(i=0;i<h;i++) { 
/* Read potentially unaligned pixels into t1 and t2

Since we're reading 16 pixels, and actually only want 8,

mask out the last 8 pixels. The 0s don't change the sum. */

perm1 = vec_lvsl(0, pix1);

pix1v = (vector unsigned char *) pix1; 
perm2 = vec_lvsl(0, pix2);

pix2v = (vector unsigned char *) pix2; 
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); 
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); 
387 
/*

Since we want to use unsigned chars, we can take advantage

of the fact that abs(ab)^2 = (ab)^2.

*/

392 
/* Calculate abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
397 
/* Square the values and add them to our sum */

sum = vec_msum(t5, t5, sum); 
400 
pix1 += line_size; 
pix2 += line_size; 
} 
404 
/* Sum up the four partial sums, and put the result into s */

sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); 
sumsqr = vec_splat(sumsqr, 3);

vec_ste(sumsqr, 0, &s);

409 
410 
411  
/**

* Sum of Squared Errors for a 16x16 block.

* AltiVecenhanced.

* It's the sad16_altivec code above w/ squaring added.

*/

int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

DECLARE_ALIGNED_16(int, s);

const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
422 
vector unsigned char perm1, perm2, *pix1v, *pix2v; 
vector unsigned char t1, t2, t3,t4, t5; 
424 
425 
426  
sum = (vector unsigned int)vec_splat_u32(0); 
429 
for(i=0;i<h;i++) { 
/* Read potentially unaligned pixels into t1 and t2 */

perm1 = vec_lvsl(0, pix1);

pix1v = (vector unsigned char *) pix1; 
perm2 = vec_lvsl(0, pix2);

pix2v = (vector unsigned char *) pix2; 
t1 = vec_perm(pix1v[0], pix1v[1], perm1); 
t2 = vec_perm(pix2v[0], pix2v[1], perm2); 
438 
/*

Since we want to use unsigned chars, we can take advantage

of the fact that abs(ab)^2 = (ab)^2.

*/

443 
/* Calculate abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
448 
/* Square the values and add them to our sum */

sum = vec_msum(t5, t5, sum); 
451 
pix1 += line_size; 
pix2 += line_size; 
} 
455 
/* Sum up the four partial sums, and put the result into s */

sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); 
sumsqr = vec_splat(sumsqr, 3);

vec_ste(sumsqr, 0, &s);

460 
return s;

} 
463 
int pix_sum_altivec(uint8_t * pix, int line_size) 
{ 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm, *pixv; 
vector unsigned char t1; 
vector unsigned int sad; 
vector signed int sumdiffs; 
471 
int i;

DECLARE_ALIGNED_16(int, s);

474 
sad = (vector unsigned int)vec_splat_u32(0); 
476 
for (i = 0; i < 16; i++) { 
/* Read the potentially unaligned 16 pixels into t1 */

perm = vec_lvsl(0, pix);

pixv = (vector unsigned char *) pix; 
t1 = vec_perm(pixv[0], pixv[1], perm); 
482 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t1, sad); 
485 
pix += line_size; 
} 
488 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

493 
return s;

} 
496 
/*
 * Copy an 8x8 block of bytes from pixels into block, widening every byte
 * to a 16-bit value by zero extension.
 *
 * pixels may be unaligned and advances by line_size bytes per row.
 * block is written with aligned vector stores, 16 bytes per row, so it is
 * assumed to be 16-byte aligned.
 */
void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}

/*
 * Store the per-pixel difference s1 - s2 of two 8x8 byte blocks into block
 * as 16-bit values.
 *
 * s1 and s2 may be unaligned; both advance by stride bytes per row.
 * block is written with aligned vector stores, eight shorts per row, so it
 * is assumed to be 16-byte aligned. The loop body handles two rows per
 * iteration (manual unroll), hence four iterations for eight rows.
 */
void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

/*
 * Add the first w bytes of src to dst in place: dst[i] += src[i], with
 * 8-bit wrap-around.
 *
 * Full groups of 16 bytes are handled with aligned vector loads/stores;
 * any remaining w % 16 bytes are handled by a scalar loop that performs
 * the same addition.
 */
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for (i = 0; (i + 15) < w; i += 16) {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }

    /* if w is not a multiple of 16: add (not copy) the leftover bytes */
    for (; i < w; i++) {
        dst[i] += src[i];
    }
}

/* next one assumes that ((line_size % 16) == 0) */
/*
 * Copy a 16-pixel-wide block of h rows from pixels (possibly unaligned)
 * to block (written with aligned vector stores).
 *
 * The alignment permute is computed once from the initial pixels pointer,
 * which is why line_size must keep the alignment unchanged from row to row.
 * Rows are processed in groups of four, so h is expected to be a multiple
 * of 4.
 */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    // hand-unrolling the loop by 4 gains about 15%
    // minimum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        /* Load the two aligned vectors covering each of the four rows */
        pixelsv1  = vec_ld(0, (unsigned char*)pixels);
        pixelsv2  = vec_ld(15, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels);

        /* Realign each row and store it */
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);

        pixels += line_size_4;
        block  += line_size_4;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
}

/* next one assumes that ((line_size % 16) == 0) */

/* Byte-wise rounded-up average of four packed 8-bit values: a = (a + b + 1) >> 1 per byte. */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/*
 * Average a 16-pixel-wide block of h rows from pixels (possibly unaligned)
 * into block in place: block = vec_avg(block, pixels).
 *
 * block is accessed with aligned vector loads/stores. The alignment permute
 * is computed once from the initial pixels pointer, so line_size must keep
 * the alignment unchanged from row to row.
 */
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);   /* realign the source row */
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
}

/* next one assumes that ((line_size % 8) == 0) */

void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
{ 
POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);

register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 
int i;

700 
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

702 
for (i = 0; i < h; i++) { 
/*

block is 8 bytesaligned, so we're either in the

left block (16 bytesaligned) or in the right block (not)

*/

int rightside = ((unsigned long)block & 0x0000000F); 
709 
blockv = vec_ld(0, block);

pixelsv1 = vec_ld(0, (unsigned char*)pixels); 
pixelsv2 = vec_ld(16, (unsigned char*)pixels); 
pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

714 
715 
{ 
pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); 
} 
else

719 
720 
721 
723 
blockv = vec_avg(blockv, pixelsv); 
725 
vec_st(blockv, 0, block);

727 
pixels += line_size; 
block += line_size; 
} 
731 
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

} 
734 
/* next one assumes that ((line_size % 8) == 0) */

void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
{ 
POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);

register int i; 
739 
register vector unsigned char 
740 
pixelsv1, pixelsv2, 
741 
pixelsavg; 
742 
register vector unsigned char 
743 
blockv, temp1, temp2; 
744 
register vector unsigned short 
745 
pixelssum1, pixelssum2, temp3; 
746 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
747 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
748  
749 
temp1 = vec_ld(0, pixels);

750 
temp2 = vec_ld(16, pixels);

751 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

752 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
753 
{ 
754 
pixelsv2 = temp2; 
755 
} 
756 
else

757 
{ 
758 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

759 
} 
760 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
761 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
762 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
763 
(vector unsigned short)pixelsv2); 
764 
pixelssum1 = vec_add(pixelssum1, vctwo); 
765  
766 
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);

767 
for (i = 0; i < h ; i++) { 
768 
int rightside = ((unsigned long)block & 0x0000000F); 
769 
blockv = vec_ld(0, block);

770  
771 
temp1 = vec_ld(line_size, pixels); 
772 
temp2 = vec_ld(line_size + 16, pixels);

773 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
774 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
775 
{ 
776 
pixelsv2 = temp2; 
777 
} 
778 
else

779 
{ 
780 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

781 
} 
782  
783 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
784 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
785 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
786 
(vector unsigned short)pixelsv2); 
787 
temp3 = vec_add(pixelssum1, pixelssum2); 
788 
temp3 = vec_sra(temp3, vctwo); 
789 
pixelssum1 = vec_add(pixelssum2, vctwo); 
790 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
791  
792 
if (rightside)

793 
{ 
794 
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
795 
} 
796 
else

797 
{ 
798 
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
799 
} 
800  
801 
vec_st(blockv, 0, block);

802  
803 
block += line_size; 
804 
pixels += line_size; 
805 
} 
806  
807 
POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

808 
} 
809  
810 
/* next one assumes that ((line_size % 8) == 0) */

811 
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
812 
{ 
813 
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);

814 
register int i; 
815 
register vector unsigned char 
816 
pixelsv1, pixelsv2, 
817 
pixelsavg; 
818 
register vector unsigned char 
819 
blockv, temp1, temp2; 
820 
register vector unsigned short 
821 
pixelssum1, pixelssum2, temp3; 
822 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
823 
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 
824 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
825  
826 
temp1 = vec_ld(0, pixels);

827 
temp2 = vec_ld(16, pixels);

828 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

829 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
830 
{ 
831 
pixelsv2 = temp2; 
832 
} 
833 
else

834 
{ 
835 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

836 
} 
837 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
838 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
839 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
840 
(vector unsigned short)pixelsv2); 
841 
pixelssum1 = vec_add(pixelssum1, vcone); 
842  
843 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

844 
for (i = 0; i < h ; i++) { 
845 
int rightside = ((unsigned long)block & 0x0000000F); 
846 
blockv = vec_ld(0, block);

847  
848 
temp1 = vec_ld(line_size, pixels); 
849 
temp2 = vec_ld(line_size + 16, pixels);

850 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
851 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
852 
{ 
853 
pixelsv2 = temp2; 
854 
} 
855 
else

856 
{ 
857 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

858 
} 
859  
860 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
861 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
862 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
863 
(vector unsigned short)pixelsv2); 
864 
temp3 = vec_add(pixelssum1, pixelssum2); 
865 
temp3 = vec_sra(temp3, vctwo); 
866 
pixelssum1 = vec_add(pixelssum2, vcone); 
867 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
868  
869 
if (rightside)

870 
{ 
871 
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
872 
} 
873 
else

874 
{ 
875 
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
876 
} 
877  
878 
vec_st(blockv, 0, block);

879  
880 
block += line_size; 
881 
pixels += line_size; 
882 
} 
883  
884 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

885 
} 
886  
887 
/* next one assumes that ((line_size % 16) == 0) */

888 
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
889 
{ 
890 
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);

891 
register int i; 
892 
register vector unsigned char 
893 
pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
894 
register vector unsigned char 
895 
blockv, temp1, temp2; 
896 
register vector unsigned short 
897 
pixelssum1, pixelssum2, temp3, 
898 
pixelssum3, pixelssum4, temp4; 
899 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
900 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
901  
902 
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

903  
904 
temp1 = vec_ld(0, pixels);

905 
temp2 = vec_ld(16, pixels);

906 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

907 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
908 
{ 
909 
pixelsv2 = temp2; 
910 
} 
911 
else

912 
{ 
913 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

914 
} 
915 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
916 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
917 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
918 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
919 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
920 
(vector unsigned short)pixelsv4); 
921 
pixelssum3 = vec_add(pixelssum3, vctwo); 
922 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
923 
(vector unsigned short)pixelsv2); 
924 
pixelssum1 = vec_add(pixelssum1, vctwo); 
925  
926 
for (i = 0; i < h ; i++) { 
927 
blockv = vec_ld(0, block);

928  
929 
temp1 = vec_ld(line_size, pixels); 
930 
temp2 = vec_ld(line_size + 16, pixels);

931 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
932 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
933 
{ 
934 
pixelsv2 = temp2; 
935 
} 
936 
else

937 
{ 
938 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

939 
} 
940  
941 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
942 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
943 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
944 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
945  
946 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
947 
(vector unsigned short)pixelsv4); 
948 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
949 
(vector unsigned short)pixelsv2); 
950 
temp4 = vec_add(pixelssum3, pixelssum4); 
951 
temp4 = vec_sra(temp4, vctwo); 
952 
temp3 = vec_add(pixelssum1, pixelssum2); 
953 
temp3 = vec_sra(temp3, vctwo); 
954  
955 
pixelssum3 = vec_add(pixelssum4, vctwo); 
956 
pixelssum1 = vec_add(pixelssum2, vctwo); 
957  
958 
blockv = vec_packsu(temp3, temp4); 
959  
960 
vec_st(blockv, 0, block);

961  
962 
block += line_size; 
963 
pixels += line_size; 
964 
} 
965  
966 
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

967 
} 
968  
969 
/* next one assumes that ((line_size % 16) == 0) */

970 
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
971 
{ 
972 
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);

973 
register int i; 
974 
register vector unsigned char 
975 
pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
976 
register vector unsigned char 
977 
blockv, temp1, temp2; 
978 
register vector unsigned short 
979 
pixelssum1, pixelssum2, temp3, 
980 
pixelssum3, pixelssum4, temp4; 
981 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
982 
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 
983 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
984  
985 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

986  
987 
temp1 = vec_ld(0, pixels);

988 
temp2 = vec_ld(16, pixels);

989 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

990 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
991 
{ 
992 
pixelsv2 = temp2; 
993 
} 
994 
else

995 
{ 
996 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

997 
} 
998 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
999 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1000 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1001 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1002 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
1003 
(vector unsigned short)pixelsv4); 
1004 
pixelssum3 = vec_add(pixelssum3, vcone); 
1005 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1006 
(vector unsigned short)pixelsv2); 
1007 
pixelssum1 = vec_add(pixelssum1, vcone); 
1008  
1009 
for (i = 0; i < h ; i++) { 
1010 
blockv = vec_ld(0, block);

1011  
1012 
temp1 = vec_ld(line_size, pixels); 
1013 
temp2 = vec_ld(line_size + 16, pixels);

1014 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1015 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
1016 
{ 
1017 
pixelsv2 = temp2; 
1018 
} 
1019 
else

1020 
{ 
1021 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1022 
} 
1023  
1024 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
1025 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1026 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1027 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1028  
1029 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
1030 
(vector unsigned short)pixelsv4); 
1031 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1032 
(vector unsigned short)pixelsv2); 
1033 
temp4 = vec_add(pixelssum3, pixelssum4); 
1034 
temp4 = vec_sra(temp4, vctwo); 
1035 
temp3 = vec_add(pixelssum1, pixelssum2); 
1036 
temp3 = vec_sra(temp3, vctwo); 
1037  
1038 
pixelssum3 = vec_add(pixelssum4, vcone); 
1039 
pixelssum1 = vec_add(pixelssum2, vcone); 
1040  
1041 
blockv = vec_packsu(temp3, temp4); 
1042  
1043 
vec_st(blockv, 0, block);

1044  
1045 
block += line_size; 
1046 
pixels += line_size; 
1047 
} 
1048  
1049 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

1050 
} 
1051  
1052 
int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ 
1053 
POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);

1054 
int sum;

1055 
register const vector unsigned char vzero = 
1056 
(const vector unsigned char)vec_splat_u8(0); 
1057 
register vector signed short temp0, temp1, temp2, temp3, temp4, 
1058 
temp5, temp6, temp7; 
1059 
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);

1060 
{ 
1061 
register const vector signed short vprod1 =(const vector signed short) 
1062 
AVV( 1,1, 1,1, 1,1, 1,1); 
1063 
register const vector signed short vprod2 =(const vector signed short) 
1064 
AVV( 1, 1,1,1, 1, 1,1,1); 
1065 
register const vector signed short vprod3 =(const vector signed short) 
1066 
AVV( 1, 1, 1, 1,1,1,1,1); 
1067 
register const vector unsigned char perm1 = (const vector unsigned char) 
1068 
AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 
1069 
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); 
1070 
register const vector unsigned char perm2 = (const vector unsigned char) 
1071 
AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 
1072 
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B); 
1073 
register const vector unsigned char perm3 = (const vector unsigned char) 
1074 
AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 
1075 
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); 
1076  
1077 
#define ONEITERBUTTERFLY(i, res) \

1078 
{ \ 
1079 
register vector unsigned char src1, src2, srcO; \ 
1080 
register vector unsigned char dst1, dst2, dstO; \ 
1081 
register vector signed short srcV, dstV; \ 
1082 
register vector signed short but0, but1, but2, op1, op2, op3; \ 
1083 
src1 = vec_ld(stride * i, src); \ 
1084 
src2 = vec_ld((stride * i) + 15, src); \

1085 
srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ 
1086 
dst1 = vec_ld(stride * i, dst); \ 
1087 
dst2 = vec_ld((stride * i) + 15, dst); \

1088 
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ 
1089 
/* promote the unsigned chars to signed shorts */ \

1090 
/* we're in the 8x8 function, we only care for the first 8 */ \

1091 
srcV = \ 
1092 
(vector signed short)vec_mergeh((vector signed char)vzero, \ 
1093 
(vector signed char)srcO); \ 
1094 
dstV = \ 
1095 
(vector signed short)vec_mergeh((vector signed char)vzero, \ 
1096 
(vector signed char)dstO); \ 
1097 
/* subtractions inside the first butterfly */ \

1098 
but0 = vec_sub(srcV, dstV); \ 
1099 
op1 = vec_perm(but0, but0, perm1); \ 
1100 
but1 = vec_mladd(but0, vprod1, op1); \ 
1101 
op2 = vec_perm(but1, but1, perm2); \ 
1102 
but2 = vec_mladd(but1, vprod2, op2); \ 
1103 
op3 = vec_perm(but2, but2, perm3); \ 
1104 
res = vec_mladd(but2, vprod3, op3); \ 
1105 
} 
1106 
ONEITERBUTTERFLY(0, temp0);

1107 
ONEITERBUTTERFLY(1, temp1);

1108 
ONEITERBUTTERFLY(2, temp2);

1109 
ONEITERBUTTERFLY(3, temp3);

1110 
ONEITERBUTTERFLY(4, temp4);

1111 
ONEITERBUTTERFLY(5, temp5);

1112 
ONEITERBUTTERFLY(6, temp6);

1113 
ONEITERBUTTERFLY(7, temp7);

1114 
} 
1115 
#undef ONEITERBUTTERFLY

1116 
{ 
1117 
register vector signed int vsum; 
1118 
register vector signed short line0 = vec_add(temp0, temp1); 
1119 
register vector signed short line1 = vec_sub(temp0, temp1); 
1120 
register vector signed short line2 = vec_add(temp2, temp3); 
1121 
register vector signed short line3 = vec_sub(temp2, temp3); 
1122 
register vector signed short line4 = vec_add(temp4, temp5); 
1123 
register vector signed short line5 = vec_sub(temp4, temp5); 
1124 
register vector signed short line6 = vec_add(temp6, temp7); 
1125 
register vector signed short line7 = vec_sub(temp6, temp7); 
1126  
1127 
register vector signed short line0B = vec_add(line0, line2); 
1128 
register vector signed short line2B = vec_sub(line0, line2); 
1129 
register vector signed short line1B = vec_add(line1, line3); 
1130 
register vector signed short line3B = vec_sub(line1, line3); 
1131 
register vector signed short line4B = vec_add(line4, line6); 
1132 
register vector signed short line6B = vec_sub(line4, line6); 
1133 
register vector signed short line5B = vec_add(line5, line7); 
1134 
register vector signed short line7B = vec_sub(line5, line7); 
1135  
1136 
register vector signed short line0C = vec_add(line0B, line4B); 
1137 
register vector signed short line4C = vec_sub(line0B, line4B); 
1138 
register vector signed short line1C = vec_add(line1B, line5B); 
1139 
register vector signed short line5C = vec_sub(line1B, line5B); 
1140 
register vector signed short line2C = vec_add(line2B, line6B); 
1141 
register vector signed short line6C = vec_sub(line2B, line6B); 
1142 
register vector signed short line3C = vec_add(line3B, line7B); 
1143 
register vector signed short line7C = vec_sub(line3B, line7B); 
1144  
1145 
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));

1146 
vsum = vec_sum4s(vec_abs(line1C), vsum); 
1147 
vsum = vec_sum4s(vec_abs(line2C), vsum); 
1148 
vsum = vec_sum4s(vec_abs(line3C), vsum); 
1149 
vsum = vec_sum4s(vec_abs(line4C), vsum); 
1150 
vsum = vec_sum4s(vec_abs(line5C), vsum); 
1151 
vsum = vec_sum4s(vec_abs(line6C), vsum); 
1152 
vsum = vec_sum4s(vec_abs(line7C), vsum); 
1153 
vsum = vec_sums(vsum, (vector signed int)vzero); 
1154 
vsum = vec_splat(vsum, 3);

1155 
vec_ste(vsum, 0, &sum);

1156 
} 
1157 
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);

1158 
return sum;

1159 
} 
1160  
1161 
/*
  16x8 works with 16 elements; this avoids replicating
  loads and gives the compiler more room for scheduling.
  It's only used from inside hadamard8_diff16_altivec.

  Unfortunately, it seems gcc-3.3 is a bit dumb, and
  the compiled code has a LOT of spill code; it seems
  gcc (unlike xlc) cannot keep everything in registers
  by itself. The following code includes hand-made
  register allocation. It's not clean, but on
  a 7450 the resulting code is much faster (best case
  falls from 700+ cycles to 550).

  xlc doesn't add spill code, but it doesn't know how to
  schedule for the 7450, and its code isn't much faster than
  gcc-3.3 on the 7450 (but uses 25% fewer instructions...)

  On the 970, the hand-made RA is still a win (around 690
  vs. around 780), but xlc goes to around 660 on the
  regular C code...
*/

1182  
1183 
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { 
1184 
int sum;

1185 
register vector signed short 
1186 
temp0 REG_v(v0), 
1187 
temp1 REG_v(v1), 
1188 
temp2 REG_v(v2), 
1189 
temp3 REG_v(v3), 
1190 
temp4 REG_v(v4), 
1191 
temp5 REG_v(v5), 
1192 
temp6 REG_v(v6), 
1193 
temp7 REG_v(v7); 
1194 
register vector signed short 
1195 
temp0S REG_v(v8), 
1196 
temp1S REG_v(v9), 
1197 
temp2S REG_v(v10), 
1198 
temp3S REG_v(v11), 
1199 
temp4S REG_v(v12), 
1200 
temp5S REG_v(v13), 
1201 
temp6S REG_v(v14), 
1202 
temp7S REG_v(v15); 
1203 
register const vector unsigned char vzero REG_v(v31)= 
1204 
(const vector unsigned char)vec_splat_u8(0); 
1205 
{ 
1206 
register const vector signed short vprod1 REG_v(v16)= 
1207 
(const vector signed short)AVV( 1,1, 1,1, 1,1, 1,1); 
1208 
register const vector signed short vprod2 REG_v(v17)= 
1209 
(const vector signed short)AVV( 1, 1,1,1, 1, 1,1,1); 
1210 
register const vector signed short vprod3 REG_v(v18)= 
1211 
(const vector signed short)AVV( 1, 1, 1, 1,1,1,1,1); 
1212 
register const vector unsigned char perm1 REG_v(v19)= 
1213 
(const vector unsigned char) 
1214 
AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 
1215 
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); 
1216 
register const vector unsigned char perm2 REG_v(v20)= 
1217 
(const vector unsigned char) 
1218 
AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 
1219 
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B); 
1220 
register const vector unsigned char perm3 REG_v(v21)= 
1221 
(const vector unsigned char) 
1222 
AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 
1223 
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); 
1224  
1225 
#define ONEITERBUTTERFLY(i, res1, res2) \

1226 
{ \ 
1227 
register vector unsigned char src1 REG_v(v22), \ 
1228 
src2 REG_v(v23), \ 
1229 
dst1 REG_v(v24), \ 
1230 
dst2 REG_v(v25), \ 
1231 
srcO REG_v(v22), \ 
1232 
dstO REG_v(v23); \ 
1233 
\ 
1234 
register vector signed short srcV REG_v(v24), \ 
1235 
dstV REG_v(v25), \ 
1236 
srcW REG_v(v26), \ 
1237 
dstW REG_v(v27), \ 
1238 
but0 REG_v(v28), \ 
1239 
but0S REG_v(v29), \ 
1240 
op1 REG_v(v30), \ 
1241 
but1 REG_v(v22), \ 
1242 
op1S REG_v(v23), \ 
1243 
but1S REG_v(v24), \ 
1244 
op2 REG_v(v25), \ 
1245 
but2 REG_v(v26), \ 
1246 
op2S REG_v(v27), \ 
1247 
but2S REG_v(v28), \ 
1248 
op3 REG_v(v29), \ 
1249 
op3S REG_v(v30); \ 
1250 
\ 
1251 
src1 = vec_ld(stride * i, src); \ 
1252 
src2 = vec_ld((stride * i) + 16, src); \

1253 
srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ 
1254 
dst1 = vec_ld(stride * i, dst); \ 
1255 
dst2 = vec_ld((stride * i) + 16, dst); \

1256 
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ 
1257 
/* promote the unsigned chars to signed shorts */ \

1258 
srcV = \ 
1259 
(vector signed short)vec_mergeh((vector signed char)vzero, \ 
1260 
(vector signed char)srcO); \ 
1261 
dstV = \ 
1262 
(vector signed short)vec_mergeh((vector signed char)vzero, \ 
1263 
(vector signed char)dstO); \ 
1264 
srcW = \ 
1265 
(vector signed short)vec_mergel((vector signed char)vzero, \ 
1266 
(vector signed char)srcO); \ 
1267 
dstW = \ 
1268 
(vector signed short)vec_mergel((vector signed char)vzero, \ 
1269 
(vector signed char)dstO); \ 
1270 
/* subtractions inside the first butterfly */ \

1271 
but0 = vec_sub(srcV, dstV); \ 
1272 
but0S = vec_sub(srcW, dstW); \ 
1273 
op1 = vec_perm(but0, but0, perm1); \ 
1274 
but1 = vec_mladd(but0, vprod1, op1); \ 
1275 
op1S = vec_perm(but0S, but0S, perm1); \ 
1276 
but1S = vec_mladd(but0S, vprod1, op1S); \ 
1277 
op2 = vec_perm(but1, but1, perm2); \ 
1278 
but2 = vec_mladd(but1, vprod2, op2); \ 
1279 
op2S = vec_perm(but1S, but1S, perm2); \ 
1280 
but2S = vec_mladd(but1S, vprod2, op2S); \ 
1281 
op3 = vec_perm(but2, but2, perm3); \ 
1282 
res1 = vec_mladd(but2, vprod3, op3); \ 
1283 
op3S = vec_perm(but2S, but2S, perm3); \ 
1284 
res2 = vec_mladd(but2S, vprod3, op3S); \ 
1285 
} 
1286 
ONEITERBUTTERFLY(0, temp0, temp0S);

1287 
ONEITERBUTTERFLY(1, temp1, temp1S);

1288 
ONEITERBUTTERFLY(2, temp2, temp2S);

1289 
ONEITERBUTTERFLY(3, temp3, temp3S);

1290 
ONEITERBUTTERFLY(4, temp4, temp4S);

1291 
ONEITERBUTTERFLY(5, temp5, temp5S);

1292 
ONEITERBUTTERFLY(6, temp6, temp6S);

1293 
ONEITERBUTTERFLY(7, temp7, temp7S);

1294 
} 
1295 
#undef ONEITERBUTTERFLY

1296 
{ 
1297 
register vector signed int vsum; 
1298 
register vector signed short line0S, line1S, line2S, line3S, line4S, 
1299 
line5S, line6S, line7S, line0BS,line2BS, 
1300 
line1BS,line3BS,line4BS,line6BS,line5BS, 
1301 
line7BS,line0CS,line4CS,line1CS,line5CS, 
1302 
line2CS,line6CS,line3CS,line7CS; 
1303  
1304 
register vector signed short line0 = vec_add(temp0, temp1); 
1305 
register vector signed short line1 = vec_sub(temp0, temp1); 
1306 
register vector signed short line2 = vec_add(temp2, temp3); 
1307 
register vector signed short line3 = vec_sub(temp2, temp3); 
1308 
register vector signed short line4 = vec_add(temp4, temp5); 
1309 
register vector signed short line5 = vec_sub(temp4, temp5); 
1310 
register vector signed short line6 = vec_add(temp6, temp7); 
1311 
register vector signed short line7 = vec_sub(temp6, temp7); 
1312  
1313 
register vector signed short line0B = vec_add(line0, line2); 
1314 
register vector signed short line2B = vec_sub(line0, line2); 
1315 
register vector signed short line1B = vec_add(line1, line3); 
1316 
register vector signed short line3B = vec_sub(line1, line3); 
1317 
register vector signed short line4B = vec_add(line4, line6); 
1318 
register vector signed short line6B = vec_sub(line4, line6); 
1319 
register vector signed short line5B = vec_add(line5, line7); 
1320 
register vector signed short line7B = vec_sub(line5, line7); 
1321  
1322 
register vector signed short line0C = vec_add(line0B, line4B); 
1323 
register vector signed short line4C = vec_sub(line0B, line4B); 
1324 
register vector signed short line1C = vec_add(line1B, line5B); 
1325 
register vector signed short line5C = vec_sub(line1B, line5B); 
1326 
register vector signed short line2C = vec_add(line2B, line6B); 
1327 
register vector signed short line6C = vec_sub(line2B, line6B); 
1328 
register vector signed short line3C = vec_add(line3B, line7B); 
1329 
register vector signed short line7C = vec_sub(line3B, line7B); 
1330  
1331 
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));

1332 
vsum = vec_sum4s(vec_abs(line1C), vsum); 
1333 
vsum = vec_sum4s(vec_abs(line2C), vsum); 
1334 
vsum = vec_sum4s(vec_abs(line3C), vsum); 
1335 
vsum = vec_sum4s(vec_abs(line4C), vsum); 
1336 
vsum = vec_sum4s(vec_abs(line5C), vsum); 
1337 
vsum = vec_sum4s(vec_abs(line6C), vsum); 
1338 
vsum = vec_sum4s(vec_abs(line7C), vsum); 
1339  
1340 
line0S = vec_add(temp0S, temp1S); 
1341 
line1S = vec_sub(temp0S, temp1S); 
1342 
line2S = vec_add(temp2S, temp3S); 
1343 
line3S = vec_sub(temp2S, temp3S); 
1344 
line4S = vec_add(temp4S, temp5S); 
1345 
line5S = vec_sub(temp4S, temp5S); 
1346 
line6S = vec_add(temp6S, temp7S); 
1347 
line7S = vec_sub(temp6S, temp7S); 
1348  
1349 
line0BS = vec_add(line0S, line2S); 
1350 
line2BS = vec_sub(line0S, line2S); 
1351 
line1BS = vec_add(line1S, line3S); 
1352 
line3BS = vec_sub(line1S, line3S); 
1353 
line4BS = vec_add(line4S, line6S); 
1354 
line6BS = vec_sub(line4S, line6S); 
1355 
line5BS = vec_add(line5S, line7S); 
1356 
line7BS = vec_sub(line5S, line7S); 
1357  
1358 
line0CS = vec_add(line0BS, line4BS); 
1359 
line4CS = vec_sub(line0BS, line4BS); 
1360 
line1CS = vec_add(line1BS, line5BS); 
1361 
line5CS = vec_sub(line1BS, line5BS); 
1362 
line2CS = vec_add(line2BS, line6BS); 
1363 
line6CS = vec_sub(line2BS, line6BS); 
1364 
line3CS = vec_add(line3BS, line7BS); 
1365 
line7CS = vec_sub(line3BS, line7BS); 
1366  
1367 
vsum = vec_sum4s(vec_abs(line0CS), vsum); 
1368 
vsum = vec_sum4s(vec_abs(line1CS), vsum); 
1369 
vsum = vec_sum4s(vec_abs(line2CS), vsum); 
1370 
vsum = vec_sum4s(vec_abs(line3CS), vsum); 
1371 
vsum = vec_sum4s(vec_abs(line4CS), vsum); 
1372 
vsum = vec_sum4s(vec_abs(line5CS), vsum); 
1373 
vsum = vec_sum4s(vec_abs(line6CS), vsum); 
1374 
vsum = vec_sum4s(vec_abs(line7CS), vsum); 
1375 
vsum = vec_sums(vsum, (vector signed int)vzero); 
1376 
vsum = vec_splat(vsum, 3);

1377 
vec_ste(vsum, 0, &sum);

1378 
} 
1379 
return sum;

1380 
} 
1381  
1382 
int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ 
1383 
POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);

1384 
int score;

1385 
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);

1386 
score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

1387 
if (h==16) { 
1388 
dst += 8*stride;

1389 
src += 8*stride;

1390 
score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

1391 
} 
1392 
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);

1393 
return score;

1394 
} 
1395  
1396 
static void vorbis_inverse_coupling_altivec(float *mag, float *ang, 
1397 
int blocksize)

1398 
{ 
1399 
int i;

1400 
vector float m, a;

1401 
vector bool int t0, t1; 
1402 
const vector unsigned int v_31 = //XXX 
1403 
vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1)); 
1404 
for(i=0; i<blocksize; i+=4) { 
1405 
m = vec_ld(0, mag+i);

1406 
a = vec_ld(0, ang+i);

1407 
t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); 
1408 
t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); 
1409 
a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31)); 
1410 
t0 = (vector bool int)vec_and(a, t1); 
1411 
t1 = (vector bool int)vec_andc(a, t1); 
1412 
a = vec_sub(m, (vector float)t1);

1413 
m = vec_add(m, (vector float)t0);

1414 
vec_stl(a, 0, ang+i);

1415 
vec_stl(m, 0, mag+i);

1416 
} 
1417 
} 
1418  
1419 
/* next one assumes that ((line_size % 8) == 0) */

1420 
void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
1421 
{ 
1422 
POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);

1423 
register int i; 
1424 
register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 
1425 
register vector unsigned char blockv, temp1, temp2, blocktemp; 
1426 
register vector unsigned short pixelssum1, pixelssum2, temp3; 
1427  
1428 
register const vector unsigned char vczero = (const vector unsigned char) 
1429 
vec_splat_u8(0);

1430 
register const vector unsigned short vctwo = (const vector unsigned short) 
1431 
vec_splat_u16(2);

1432  
1433 
temp1 = vec_ld(0, pixels);

1434 
temp2 = vec_ld(16, pixels);

1435 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

1436 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
1437 
pixelsv2 = temp2; 
1438 
} else {

1439 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

1440 
} 
1441 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1442 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1443 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1444 
(vector unsigned short)pixelsv2); 
1445 
pixelssum1 = vec_add(pixelssum1, vctwo); 
1446  
1447 
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);

1448 
for (i = 0; i < h ; i++) { 
1449 
int rightside = ((unsigned long)block & 0x0000000F); 
1450 
blockv = vec_ld(0, block);

1451  
1452 
temp1 = vec_ld(line_size, pixels); 
1453 
temp2 = vec_ld(line_size + 16, pixels);

1454 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1455 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
1456 
{ 
1457 
pixelsv2 = temp2; 
1458 
} else {

1459 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1460 
} 
1461  
1462 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1463 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1464 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1465 
(vector unsigned short)pixelsv2); 
1466 
temp3 = vec_add(pixelssum1, pixelssum2); 
1467 
temp3 = vec_sra(temp3, vctwo); 
1468 
pixelssum1 = vec_add(pixelssum2, vctwo); 
1469 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
1470  
1471 
if (rightside) {

1472 
blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
1473 
} else {

1474 
blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
1475 
} 
1476  
1477 
blockv = vec_avg(blocktemp, blockv); 
1478 
vec_st(blockv, 0, block);

1479  
1480 
block += line_size; 
1481 
pixels += line_size; 
1482 
} 
1483  
1484 
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);

1485 
} 
1486  
1487 
/**
 * Install the AltiVec implementations into a DSPContext.
 *
 * Only the function pointers assigned below are modified; every other
 * field of *c is left as the caller set it up.
 *
 * @param c     context whose function pointers are overwritten
 * @param avctx not used by this function
 */
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    /* Motion-estimation comparison functions. */
    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0]= sad16_altivec;
    c->sad[1]= sad8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1]= sse8_altivec;
    c->sse[0]= sse16_altivec;
    c->pix_sum = pix_sum_altivec;

    /* Block-level helpers. */
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels = get_pixels_altivec;
    c->add_bytes= add_bytes_altivec;

    /* Pixel copy / average tables. */
    c->put_pixels_tab[0][0] = put_pixels16_altivec;
    /* the two functions do the same thing, so use the same code */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
    c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
    c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
    if (ENABLE_VORBIS_DECODER) {
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
    }
}