/*


* Copyright (c) 2002 Brian Foley

* Copyright (c) 2002 Dieter Shirley

* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>

*

* This file is part of Libav.

*

* Libav is free software; you can redistribute it and/or

* modify it under the terms of the GNU Lesser General Public

* License as published by the Free Software Foundation; either

* version 2.1 of the License, or (at your option) any later version.

*

* Libav is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

* Lesser General Public License for more details.

*

* You should have received a copy of the GNU Lesser General Public

* License along with Libav; if not, write to the Free Software

* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

*/

#include "config.h" 
#if HAVE_ALTIVEC_H

#include <altivec.h> 
#endif

#include "libavcodec/dsputil.h" 
#include "util_altivec.h" 
#include "types_altivec.h" 
#include "dsputil_altivec.h" 
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s;

const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
vector unsigned char *tv; 
vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
s = 0;

sad = (vector unsigned int)vec_splat_u32(0); 
for (i = 0; i < h; i++) { 
/* Read unaligned pixels into our vectors. The vectors are as follows:

pix1v: pix1[0]pix1[15]

pix2v: pix2[0]pix2[15] pix2iv: pix2[1]pix2[16] */

tv = (vector unsigned char *) pix1; 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
tv = (vector unsigned char *) &pix2[0]; 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
tv = (vector unsigned char *) &pix2[1]; 
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); 
/* Calculate the average vector */

avgv = vec_avg(pix2v, pix2iv); 
/* Calculate a sum of abs differences vector */

t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
pix1 += line_size; 
pix2 += line_size; 
} 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

return s;

} 
static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s;

const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
vector unsigned char *tv; 
vector unsigned char pix1v, pix2v, pix3v, avgv, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
uint8_t *pix3 = pix2 + line_size; 
s = 0;

sad = (vector unsigned int)vec_splat_u32(0); 
/* Due to the fact that pix3 = pix2 + line_size, the pix3 of one

iteration becomes pix2 in the next iteration. We can use this

fact to avoid a potentially expensive unaligned read, each

time around the loop.

Read unaligned pixels into our vectors. The vectors are as follows:

pix2v: pix2[0]pix2[15]

Split the pixel vectors into shorts */

tv = (vector unsigned char *) &pix2[0]; 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
for (i = 0; i < h; i++) { 
/* Read unaligned pixels into our vectors. The vectors are as follows:

pix1v: pix1[0]pix1[15]

pix3v: pix3[0]pix3[15] */

tv = (vector unsigned char *) pix1; 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
tv = (vector unsigned char *) &pix3[0]; 
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); 
/* Calculate the average vector */

avgv = vec_avg(pix2v, pix3v); 
/* Calculate a sum of abs differences vector */

t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
pix1 += line_size; 
pix2v = pix3v; 
pix3 += line_size; 
} 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

return s;

} 
static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s;

uint8_t *pix3 = pix2 + line_size; 
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); 
vector unsigned char *tv, avgv, t5; 
vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; 
vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; 
vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; 
vector unsigned short avghv, avglv; 
vector unsigned short t1, t2, t3, t4; 
vector unsigned int sad; 
vector signed int sumdiffs; 
sad = (vector unsigned int)vec_splat_u32(0); 
s = 0;

/* Due to the fact that pix3 = pix2 + line_size, the pix3 of one

iteration becomes pix2 in the next iteration. We can use this

fact to avoid a potentially expensive unaligned read, as well

as some splitting, and vector addition each time around the loop.

Read unaligned pixels into our vectors. The vectors are as follows:

pix2v: pix2[0]pix2[15] pix2iv: pix2[1]pix2[16]

Split the pixel vectors into shorts */

tv = (vector unsigned char *) &pix2[0]; 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
tv = (vector unsigned char *) &pix2[1]; 
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); 
pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); 
pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); 
pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); 
pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); 
t1 = vec_add(pix2hv, pix2ihv); 
t2 = vec_add(pix2lv, pix2ilv); 
for (i = 0; i < h; i++) { 
/* Read unaligned pixels into our vectors. The vectors are as follows:

pix1v: pix1[0]pix1[15]

pix3v: pix3[0]pix3[15] pix3iv: pix3[1]pix3[16] */

tv = (vector unsigned char *) pix1; 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
tv = (vector unsigned char *) &pix3[0]; 
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); 
tv = (vector unsigned char *) &pix3[1]; 
pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); 
/* Note that AltiVec does have vec_avg, but this works on vector pairs

and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding

would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.

Instead, we have to split the pixel vectors into vectors of shorts,

and do the averaging by hand. */

/* Split the pixel vectors into shorts */

pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); 
pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); 
pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); 
pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); 
/* Do the averaging on them */

t3 = vec_add(pix3hv, pix3ihv); 
t4 = vec_add(pix3lv, pix3ilv); 
avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); 
avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); 
/* Pack the shorts back into a result */

avgv = vec_pack(avghv, avglv); 
/* Calculate a sum of abs differences vector */

t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
pix1 += line_size; 
pix3 += line_size; 
/* Transfer the calculated values for pix3 into pix2 */

t1 = t3; 
t2 = t4; 
} 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

return s;

} 
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s;

const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm1, perm2, pix1v_low, pix1v_high, pix2v_low, pix2v_high; 
vector unsigned char t1, t2, t3,t4, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
sad = (vector unsigned int)vec_splat_u32(0); 
for (i = 0; i < h; i++) { 
/* Read potentially unaligned pixels into t1 and t2 */

perm1 = vec_lvsl(0, pix1);

pix1v_high = vec_ld( 0, pix1);

pix1v_low = vec_ld(15, pix1);

perm2 = vec_lvsl(0, pix2);

pix2v_high = vec_ld( 0, pix2);

pix2v_low = vec_ld(15, pix2);

t1 = vec_perm(pix1v_high, pix1v_low, perm1); 
t2 = vec_perm(pix2v_high, pix2v_low, perm2); 
/* Calculate a sum of abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
pix1 += line_size; 
pix2 += line_size; 
} 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

return s;

} 
static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s;

const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 
vector unsigned char t1, t2, t3,t4, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
sad = (vector unsigned int)vec_splat_u32(0); 
permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; 
for (i = 0; i < h; i++) { 
/* Read potentially unaligned pixels into t1 and t2

Since we're reading 16 pixels, and actually only want 8,

mask out the last 8 pixels. The 0s don't change the sum. */

perm1 = vec_lvsl(0, pix1);

pix1v = (vector unsigned char *) pix1; 
perm2 = vec_lvsl(0, pix2);

pix2v = (vector unsigned char *) pix2; 
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); 
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); 
/* Calculate a sum of abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
pix1 += line_size; 
pix2 += line_size; 
} 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

return s;

} 
static int pix_norm1_altivec(uint8_t *pix, int line_size) 
{ 
int i;

int s;

const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
vector unsigned char *tv; 
vector unsigned char pixv; 
vector unsigned int sv; 
vector signed int sum; 
sv = (vector unsigned int)vec_splat_u32(0); 
s = 0;

for (i = 0; i < 16; i++) { 
/* Read in the potentially unaligned pixels */

tv = (vector unsigned char *) pix; 
pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); 
/* Square the values, and add them to our sum */

sv = vec_msum(pixv, pixv, sv); 
pix += line_size; 
} 
/* Sum up the four partial sums, and put the result into s */

sum = vec_sums((vector signed int) sv, (vector signed int) zero); 
sum = vec_splat(sum, 3);

vec_ste(sum, 0, &s);

return s;

} 
/**

* Sum of Squared Errors for a 8x8 block.

* AltiVecenhanced.

* It's the sad8_altivec code above w/ squaring added.

*/

static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s;

const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 
vector unsigned char t1, t2, t3,t4, t5; 
vector unsigned int sum; 
vector signed int sumsqr; 
sum = (vector unsigned int)vec_splat_u32(0); 
permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; 
for (i = 0; i < h; i++) { 
/* Read potentially unaligned pixels into t1 and t2

Since we're reading 16 pixels, and actually only want 8,

mask out the last 8 pixels. The 0s don't change the sum. */

perm1 = vec_lvsl(0, pix1);

pix1v = (vector unsigned char *) pix1; 
perm2 = vec_lvsl(0, pix2);

pix2v = (vector unsigned char *) pix2; 
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); 
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); 
/* Since we want to use unsigned chars, we can take advantage

of the fact that abs(ab)^2 = (ab)^2. */

/* Calculate abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
/* Square the values and add them to our sum */

sum = vec_msum(t5, t5, sum); 
pix1 += line_size; 
pix2 += line_size; 
} 
/* Sum up the four partial sums, and put the result into s */

sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); 
sumsqr = vec_splat(sumsqr, 3);

vec_ste(sumsqr, 0, &s);

return s;

} 
/**

* Sum of Squared Errors for a 16x16 block.

* AltiVecenhanced.

* It's the sad16_altivec code above w/ squaring added.

*/

static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s;

const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm1, perm2, *pix1v, *pix2v; 
vector unsigned char t1, t2, t3,t4, t5; 
vector unsigned int sum; 
vector signed int sumsqr; 
sum = (vector unsigned int)vec_splat_u32(0); 
for (i = 0; i < h; i++) { 
/* Read potentially unaligned pixels into t1 and t2 */

perm1 = vec_lvsl(0, pix1);

pix1v = (vector unsigned char *) pix1; 
perm2 = vec_lvsl(0, pix2);

pix2v = (vector unsigned char *) pix2; 
t1 = vec_perm(pix1v[0], pix1v[1], perm1); 
t2 = vec_perm(pix2v[0], pix2v[1], perm2); 
/* Since we want to use unsigned chars, we can take advantage

of the fact that abs(ab)^2 = (ab)^2. */

/* Calculate abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
/* Square the values and add them to our sum */

sum = vec_msum(t5, t5, sum); 
pix1 += line_size; 
pix2 += line_size; 
} 
/* Sum up the four partial sums, and put the result into s */

sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); 
sumsqr = vec_splat(sumsqr, 3);

vec_ste(sumsqr, 0, &s);

return s;

} 
static int pix_sum_altivec(uint8_t * pix, int line_size) 
{ 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm, *pixv; 
vector unsigned char t1; 
vector unsigned int sad; 
vector signed int sumdiffs; 
int i;

int s;

sad = (vector unsigned int)vec_splat_u32(0); 
for (i = 0; i < 16; i++) { 
/* Read the potentially unaligned 16 pixels into t1 */

perm = vec_lvsl(0, pix);

pixv = (vector unsigned char *) pix; 
t1 = vec_perm(pixv[0], pixv[1], perm); 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t1, sad); 
pix += line_size; 
} 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

return s;

} 
/**
 * Copy 8 rows of 8 pixels into block, widening each byte to a 16-bit value.
 *
 * @param block     destination; row r is stored at byte offset r * 16 and the
 *                  pointer is assumed to be 16-byte aligned
 * @param pixels    source pixels (may be unaligned)
 * @param line_size byte offset added to pixels after every row
 */
static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    int row;

    for (row = 0; row < 8; row++, pixels += line_size) {
        // Read potentially unaligned pixels. 16 are fetched although only
        // the first 8 are used; the extras are simply ignored.
        vector unsigned char shift = vec_lvsl(0, pixels);
        vector unsigned char *src  = (vector unsigned char *) pixels;
        vector unsigned char bytes = vec_perm(src[0], src[1], shift);

        // Widen the first 8 bytes to shorts by interleaving with zero bytes
        vector signed short widened = (vector signed short)vec_mergeh(zero, bytes);

        // Store the row; block is assumed to be 16-byte aligned
        vec_st(widened, row*16, (vector signed short*)block);
    }
}
/**
 * Store the per-pixel difference s1 - s2 of two blocks of 8 rows by 8 pixels
 * into block as 16-bit values.
 *
 * @param block  destination, 8 values per row; assumed 16-byte aligned
 * @param s1     minuend pixels (may be unaligned)
 * @param s2     subtrahend pixels (may be unaligned)
 * @param stride byte offset added to s1 and s2 after every row
 */
static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                                const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    /* 4 iterations, each handling two rows (manual unroll by 2) */
    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block, we assume the block is 16-byte aligned.
        // Without this store the first row of each pair would never be written.
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
/**
 * Zero the first 128 bytes of block using eight 16-byte vector stores.
 *
 * @param block destination; written with vec_st at byte offsets 0, 16, ..., 112
 */
static void clear_block_altivec(DCTELEM *block) {
    int offset;
    LOAD_ZERO;

    /* One 16-byte store per step, covering byte offsets 0 through 112 */
    for (offset = 0; offset < 128; offset += 16) {
        vec_st(zero_s16v, offset, block);
    }
}
/**
 * Add the first w bytes of src to dst in place: dst[i] += src[i], with the
 * usual modulo-256 wrap of unsigned bytes.
 *
 * @param dst destination and first operand
 * @param src second operand
 * @param w   number of bytes to process
 */
static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for (i = 0 ; (i + 15) < w ; i+=16) {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* If w is not a multiple of 16, finish the remaining bytes one at a time.
       This must add, like the vector loop above, not merely copy src. */
    for (; (i < w) ; i++) {
        dst[i] += src[i];
    }
}
/**
 * Copy h rows of 16 pixels from pixels to block.
 *
 * Assumes ((line_size % 16) == 0). The permute vector is computed once from
 * the initial pixels pointer and reused for every row, and block is written
 * with aligned vector stores. Rows are processed four at a time, so the loop
 * handles h rounded up to a multiple of 4.
 *
 * @param block     destination pixels
 * @param pixels    source pixels (may be unaligned)
 * @param line_size byte offset between consecutive rows of both buffers
 * @param h         number of rows
 */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    // Hand-unrolling the loop by 4 gains about 15%:
    // minimum execution time goes from 74 to 60 cycles.
    // It's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // All this is on a 7450, tuning for the 7450.
    for (i = 0; i < h; i += 4) {
        /* Offsets 0 and 15 fetch the two aligned vectors covering each row */
        pixelsv1  = vec_ld( 0, pixels);
        pixelsv2  = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
}

/* next one assumes that ((line_size % 16) == 0) */

/* Byte-wise rounded-up average of four packed 8-bit values:
   a = (a + b + 1) >> 1 in each byte lane, computed without carries crossing
   lanes as (a | b) - (((a ^ b) & 0xFEFEFEFE) >> 1). Note that a is both
   evaluated and assigned, so it must be a plain lvalue. */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
666 
{ 
667 
register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 
668 
register vector unsigned char perm = vec_lvsl(0, pixels); 
669 
int i;

670  
671 
for (i = 0; i < h; i++) { 
672 
pixelsv1 = vec_ld( 0, pixels);

673 
pixelsv2 = vec_ld(16,pixels);

674 
blockv = vec_ld(0, block);

675 
pixelsv = vec_perm(pixelsv1, pixelsv2, perm); 
676 
blockv = vec_avg(blockv,pixelsv); 
677 
vec_st(blockv, 0, (unsigned char*)block); 
678 
pixels+=line_size; 
679 
block +=line_size; 
680 
} 
681 
} 
682  
683 
/* next one assumes that ((line_size % 8) == 0) */

684 
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
685 
{ 
686 
register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 
687 
int i;

688  
689 
for (i = 0; i < h; i++) { 
690 
/* block is 8 bytesaligned, so we're either in the

691 
left block (16 bytesaligned) or in the right block (not) */

692 
int rightside = ((unsigned long)block & 0x0000000F); 
693  
694 
blockv = vec_ld(0, block);

695 
pixelsv1 = vec_ld( 0, pixels);

696 
pixelsv2 = vec_ld(16, pixels);

697 
pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

698  
699 
if (rightside) {

700 
pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); 
701 
} else {

702 
pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); 
703 
} 
704  
705 
blockv = vec_avg(blockv, pixelsv); 
706  
707 
vec_st(blockv, 0, block);

708  
709 
pixels += line_size; 
710 
block += line_size; 
711 
} 
712 
} 
713  
714 
/* next one assumes that ((line_size % 8) == 0) */

715 
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
716 
{ 
717 
register int i; 
718 
register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 
719 
register vector unsigned char blockv, temp1, temp2; 
720 
register vector unsigned short pixelssum1, pixelssum2, temp3; 
721 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
722 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
723  
724 
temp1 = vec_ld(0, pixels);

725 
temp2 = vec_ld(16, pixels);

726 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

727 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
728 
pixelsv2 = temp2; 
729 
} else {

730 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

731 
} 
732 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
733 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
734 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
735 
(vector unsigned short)pixelsv2); 
736 
pixelssum1 = vec_add(pixelssum1, vctwo); 
737  
738 
for (i = 0; i < h ; i++) { 
739 
int rightside = ((unsigned long)block & 0x0000000F); 
740 
blockv = vec_ld(0, block);

741  
742 
temp1 = vec_ld(line_size, pixels); 
743 
temp2 = vec_ld(line_size + 16, pixels);

744 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
745 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 
746 
pixelsv2 = temp2; 
747 
} else {

748 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

749 
} 
750  
751 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
752 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
753 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
754 
(vector unsigned short)pixelsv2); 
755 
temp3 = vec_add(pixelssum1, pixelssum2); 
756 
temp3 = vec_sra(temp3, vctwo); 
757 
pixelssum1 = vec_add(pixelssum2, vctwo); 
758 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
759  
760 
if (rightside) {

761 
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
762 
} else {

763 
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
764 
} 
765  
766 
vec_st(blockv, 0, block);

767  
768 
block += line_size; 
769 
pixels += line_size; 
770 
} 
771 
} 
772  
773 
/* next one assumes that ((line_size % 8) == 0) */

774 
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
775 
{ 
776 
register int i; 
777 
register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 
778 
register vector unsigned char blockv, temp1, temp2; 
779 
register vector unsigned short pixelssum1, pixelssum2, temp3; 
780 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
781 
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 
782 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
783  
784 
temp1 = vec_ld(0, pixels);

785 
temp2 = vec_ld(16, pixels);

786 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

787 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
788 
pixelsv2 = temp2; 
789 
} else {

790 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

791 
} 
792 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
793 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
794 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
795 
(vector unsigned short)pixelsv2); 
796 
pixelssum1 = vec_add(pixelssum1, vcone); 
797  
798 
for (i = 0; i < h ; i++) { 
799 
int rightside = ((unsigned long)block & 0x0000000F); 
800 
blockv = vec_ld(0, block);

801  
802 
temp1 = vec_ld(line_size, pixels); 
803 
temp2 = vec_ld(line_size + 16, pixels);

804 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
805 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 
806 
pixelsv2 = temp2; 
807 
} else {

808 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

809 
} 
810  
811 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
812 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
813 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
814 
(vector unsigned short)pixelsv2); 
815 
temp3 = vec_add(pixelssum1, pixelssum2); 
816 
temp3 = vec_sra(temp3, vctwo); 
817 
pixelssum1 = vec_add(pixelssum2, vcone); 
818 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
819  
820 
if (rightside) {

821 
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
822 
} else {

823 
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
824 
} 
825  
826 
vec_st(blockv, 0, block);

827  
828 
block += line_size; 
829 
pixels += line_size; 
830 
} 
831 
} 
832  
833 
/* next one assumes that ((line_size % 16) == 0) */

834 
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
835 
{ 
836 
register int i; 
837 
register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
838 
register vector unsigned char blockv, temp1, temp2; 
839 
register vector unsigned short temp3, temp4, 
840 
pixelssum1, pixelssum2, pixelssum3, pixelssum4; 
841 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
842 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
843  
844 
temp1 = vec_ld(0, pixels);

845 
temp2 = vec_ld(16, pixels);

846 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

847 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
848 
pixelsv2 = temp2; 
849 
} else {

850 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

851 
} 
852 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
853 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
854 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
855 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
856 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
857 
(vector unsigned short)pixelsv4); 
858 
pixelssum3 = vec_add(pixelssum3, vctwo); 
859 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
860 
(vector unsigned short)pixelsv2); 
861 
pixelssum1 = vec_add(pixelssum1, vctwo); 
862  
863 
for (i = 0; i < h ; i++) { 
864 
blockv = vec_ld(0, block);

865  
866 
temp1 = vec_ld(line_size, pixels); 
867 
temp2 = vec_ld(line_size + 16, pixels);

868 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
869 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 
870 
pixelsv2 = temp2; 
871 
} else {

872 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

873 
} 
874  
875 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
876 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
877 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
878 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
879  
880 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
881 
(vector unsigned short)pixelsv4); 
882 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
883 
(vector unsigned short)pixelsv2); 
884 
temp4 = vec_add(pixelssum3, pixelssum4); 
885 
temp4 = vec_sra(temp4, vctwo); 
886 
temp3 = vec_add(pixelssum1, pixelssum2); 
887 
temp3 = vec_sra(temp3, vctwo); 
888  
889 
pixelssum3 = vec_add(pixelssum4, vctwo); 
890 
pixelssum1 = vec_add(pixelssum2, vctwo); 
891  
892 
blockv = vec_packsu(temp3, temp4); 
893  
894 
vec_st(blockv, 0, block);

895  
896 
block += line_size; 
897 
pixels += line_size; 
898 
} 
899 
} 
900  
901 
/* next one assumes that ((line_size % 16) == 0) */

902 
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
903 
{ 
904 
register int i; 
905 
register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
906 
register vector unsigned char blockv, temp1, temp2; 
907 
register vector unsigned short temp3, temp4, 
908 
pixelssum1, pixelssum2, pixelssum3, pixelssum4; 
909 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
910 
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 
911 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
912  
913 
temp1 = vec_ld(0, pixels);

914 
temp2 = vec_ld(16, pixels);

915 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

916 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
917 
pixelsv2 = temp2; 
918 
} else {

919 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

920 
} 
921 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
922 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
923 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
924 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
925 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
926 
(vector unsigned short)pixelsv4); 
927 
pixelssum3 = vec_add(pixelssum3, vcone); 
928 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
929 
(vector unsigned short)pixelsv2); 
930 
pixelssum1 = vec_add(pixelssum1, vcone); 
931  
932 
for (i = 0; i < h ; i++) { 
933 
blockv = vec_ld(0, block);

934  
935 
temp1 = vec_ld(line_size, pixels); 
936 
temp2 = vec_ld(line_size + 16, pixels);

937 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
938 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 
939 
pixelsv2 = temp2; 
940 
} else {

941 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

942 
} 
943  
944 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
945 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
946 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
947 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
948  
949 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
950 
(vector unsigned short)pixelsv4); 
951 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
952 
(vector unsigned short)pixelsv2); 
953 
temp4 = vec_add(pixelssum3, pixelssum4); 
954 
temp4 = vec_sra(temp4, vctwo); 
955 
temp3 = vec_add(pixelssum1, pixelssum2); 
956 
temp3 = vec_sra(temp3, vctwo); 
957  
958 
pixelssum3 = vec_add(pixelssum4, vcone); 
959 
pixelssum1 = vec_add(pixelssum2, vcone); 
960  
961 
blockv = vec_packsu(temp3, temp4); 
962  
963 
vec_st(blockv, 0, block);

964  
965 
block += line_size; 
966 
pixels += line_size; 
967 
} 
968 
} 
969  
970 
static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ 
971 
int sum;

972 
register const vector unsigned char vzero = 
973 
(const vector unsigned char)vec_splat_u8(0); 
974 
register vector signed short temp0, temp1, temp2, temp3, temp4, 
975 
temp5, temp6, temp7; 
976 
{ 
977 
register const vector signed short vprod1 =(const vector signed short) 
978 
{ 1,1, 1,1, 1,1, 1,1 }; 
979 
register const vector signed short vprod2 =(const vector signed short) 
980 
{ 1, 1,1,1, 1, 1,1,1 }; 
981 
register const vector signed short vprod3 =(const vector signed short) 
982 
{ 1, 1, 1, 1,1,1,1,1 }; 
983 
register const vector unsigned char perm1 = (const vector unsigned char) 
984 
{0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 
985 
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; 
986 
register const vector unsigned char perm2 = (const vector unsigned char) 
987 
{0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 
988 
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; 
989 
register const vector unsigned char perm3 = (const vector unsigned char) 
990 
{0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 
991 
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}; 
992  
993 
#define ONEITERBUTTERFLY(i, res) \

994 
{ \ 
995 
register vector unsigned char src1, src2, srcO; \ 
996 
register vector unsigned char dst1, dst2, dstO; \ 
997 
register vector signed short srcV, dstV; \ 
998 
register vector signed short but0, but1, but2, op1, op2, op3; \ 
999 
src1 = vec_ld(stride * i, src); \ 
1000 
src2 = vec_ld((stride * i) + 15, src); \

1001 
srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ 
1002 
dst1 = vec_ld(stride * i, dst); \ 
1003 
dst2 = vec_ld((stride * i) + 15, dst); \

1004 
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ 
1005 
/* promote the unsigned chars to signed shorts */ \

1006 
/* we're in the 8x8 function, we only care for the first 8 */ \

1007 
srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \ 
1008 
(vector signed char)srcO); \ 
1009 
dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \ 
1010 
(vector signed char)dstO); \ 
1011 
/* subtractions inside the first butterfly */ \

1012 
but0 = vec_sub(srcV, dstV); \ 
1013 
op1 = vec_perm(but0, but0, perm1); \ 
1014 
but1 = vec_mladd(but0, vprod1, op1); \ 
1015 
op2 = vec_perm(but1, but1, perm2); \ 
1016 
but2 = vec_mladd(but1, vprod2, op2); \ 
1017 
op3 = vec_perm(but2, but2, perm3); \ 
1018 
res = vec_mladd(but2, vprod3, op3); \ 
1019 
} 
1020 
ONEITERBUTTERFLY(0, temp0);

1021 
ONEITERBUTTERFLY(1, temp1);

1022 
ONEITERBUTTERFLY(2, temp2);

1023 
ONEITERBUTTERFLY(3, temp3);

1024 
ONEITERBUTTERFLY(4, temp4);

1025 
ONEITERBUTTERFLY(5, temp5);

1026 
ONEITERBUTTERFLY(6, temp6);

1027 
ONEITERBUTTERFLY(7, temp7);

1028 
} 
1029 
#undef ONEITERBUTTERFLY

1030 
{ 
1031 
register vector signed int vsum; 
1032 
register vector signed short line0 = vec_add(temp0, temp1); 
1033 
register vector signed short line1 = vec_sub(temp0, temp1); 
1034 
register vector signed short line2 = vec_add(temp2, temp3); 
1035 
register vector signed short line3 = vec_sub(temp2, temp3); 
1036 
register vector signed short line4 = vec_add(temp4, temp5); 
1037 
register vector signed short line5 = vec_sub(temp4, temp5); 
1038 
register vector signed short line6 = vec_add(temp6, temp7); 
1039 
register vector signed short line7 = vec_sub(temp6, temp7); 
1040  
1041 
register vector signed short line0B = vec_add(line0, line2); 
1042 
register vector signed short line2B = vec_sub(line0, line2); 
1043 
register vector signed short line1B = vec_add(line1, line3); 
1044 
register vector signed short line3B = vec_sub(line1, line3); 
1045 
register vector signed short line4B = vec_add(line4, line6); 
1046 
register vector signed short line6B = vec_sub(line4, line6); 
1047 
register vector signed short line5B = vec_add(line5, line7); 
1048 
register vector signed short line7B = vec_sub(line5, line7); 
1049  
1050 
register vector signed short line0C = vec_add(line0B, line4B); 
1051 
register vector signed short line4C = vec_sub(line0B, line4B); 
1052 
register vector signed short line1C = vec_add(line1B, line5B); 
1053 
register vector signed short line5C = vec_sub(line1B, line5B); 
1054 
register vector signed short line2C = vec_add(line2B, line6B); 
1055 
register vector signed short line6C = vec_sub(line2B, line6B); 
1056 
register vector signed short line3C = vec_add(line3B, line7B); 
1057 
register vector signed short line7C = vec_sub(line3B, line7B); 
1058  
1059 
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));

1060 
vsum = vec_sum4s(vec_abs(line1C), vsum); 
1061 
vsum = vec_sum4s(vec_abs(line2C), vsum); 
1062 
vsum = vec_sum4s(vec_abs(line3C), vsum); 
1063 
vsum = vec_sum4s(vec_abs(line4C), vsum); 
1064 
vsum = vec_sum4s(vec_abs(line5C), vsum); 
1065 
vsum = vec_sum4s(vec_abs(line6C), vsum); 
1066 
vsum = vec_sum4s(vec_abs(line7C), vsum); 
1067 
vsum = vec_sums(vsum, (vector signed int)vzero); 
1068 
vsum = vec_splat(vsum, 3);

1069 
vec_ste(vsum, 0, &sum);

1070 
} 
1071 
return sum;

1072 
} 
1073  
1074 
/*
    The 16x8 variant works on 16 elements per row; this avoids replicating
    loads and gives the compiler more room for scheduling. It is only used
    from inside hadamard8_diff16_altivec.

    Unfortunately, gcc-3.3 seems to be a bit dumb: the compiled code has a LOT
    of spill code; it seems gcc (unlike xlc) cannot keep everything in
    registers by itself. The following code includes hand-made register
    allocation. It's not clean, but on a 7450 the resulting code is much
    faster (best case falls from 700+ cycles to 550).

    xlc doesn't add spill code, but it doesn't know how to schedule for the
    7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
    25% fewer instructions...)

    On the 970, the hand-made RA is still a win (around 690 vs. around 780),
    but xlc goes to around 660 on the regular C code...
*/

1092  
1093 
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { 
1094 
int sum;

1095 
register vector signed short 
1096 
temp0 __asm__ ("v0"),

1097 
temp1 __asm__ ("v1"),

1098 
temp2 __asm__ ("v2"),

1099 
temp3 __asm__ ("v3"),

1100 
temp4 __asm__ ("v4"),

1101 
temp5 __asm__ ("v5"),

1102 
temp6 __asm__ ("v6"),

1103 
temp7 __asm__ ("v7");

1104 
register vector signed short 
1105 
temp0S __asm__ ("v8"),

1106 
temp1S __asm__ ("v9"),

1107 
temp2S __asm__ ("v10"),

1108 
temp3S __asm__ ("v11"),

1109 
temp4S __asm__ ("v12"),

1110 
temp5S __asm__ ("v13"),

1111 
temp6S __asm__ ("v14"),

1112 
temp7S __asm__ ("v15");

1113 
register const vector unsigned char vzero __asm__ ("v31") = 
1114 
(const vector unsigned char)vec_splat_u8(0); 
1115 
{ 
1116 
register const vector signed short vprod1 __asm__ ("v16") = 
1117 
(const vector signed short){ 1,1, 1,1, 1,1, 1,1 }; 
1118 
register const vector signed short vprod2 __asm__ ("v17") = 
1119 
(const vector signed short){ 1, 1,1,1, 1, 1,1,1 }; 
1120 
register const vector signed short vprod3 __asm__ ("v18") = 
1121 
(const vector signed short){ 1, 1, 1, 1,1,1,1,1 }; 
1122 
register const vector unsigned char perm1 __asm__ ("v19") = 
1123 
(const vector unsigned char) 
1124 
{0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 
1125 
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; 
1126 
register const vector unsigned char perm2 __asm__ ("v20") = 
1127 
(const vector unsigned char) 
1128 
{0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 
1129 
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; 
1130 
register const vector unsigned char perm3 __asm__ ("v21") = 
1131 
(const vector unsigned char) 
1132 
{0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 
1133 
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}; 
1134  
1135 
#define ONEITERBUTTERFLY(i, res1, res2) \

1136 
{ \ 
1137 
register vector unsigned char src1 __asm__ ("v22"), \ 
1138 
src2 __asm__ ("v23"), \

1139 
dst1 __asm__ ("v24"), \

1140 
dst2 __asm__ ("v25"), \

1141 
srcO __asm__ ("v22"), \

1142 
dstO __asm__ ("v23"); \

1143 
\ 
1144 
register vector signed short srcV __asm__ ("v24"), \ 
1145 
dstV __asm__ ("v25"), \

1146 
srcW __asm__ ("v26"), \

1147 
dstW __asm__ ("v27"), \

1148 
but0 __asm__ ("v28"), \

1149 
but0S __asm__ ("v29"), \

1150 
op1 __asm__ ("v30"), \

1151 
but1 __asm__ ("v22"), \

1152 
op1S __asm__ ("v23"), \

1153 
but1S __asm__ ("v24"), \

1154 
op2 __asm__ ("v25"), \

1155 
but2 __asm__ ("v26"), \

1156 
op2S __asm__ ("v27"), \

1157 
but2S __asm__ ("v28"), \

1158 
op3 __asm__ ("v29"), \

1159 
op3S __asm__ ("v30"); \

1160 
\ 
1161 
src1 = vec_ld(stride * i, src); \ 
1162 
src2 = vec_ld((stride * i) + 16, src); \

1163 
srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ 
1164 
dst1 = vec_ld(stride * i, dst); \ 
1165 
dst2 = vec_ld((stride * i) + 16, dst); \

1166 
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ 
1167 
/* promote the unsigned chars to signed shorts */ \

1168 
srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \ 
1169 
(vector signed char)srcO); \ 
1170 
dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \ 
1171 
(vector signed char)dstO); \ 
1172 
srcW = (vector signed short)vec_mergel((vector signed char)vzero, \ 
1173 
(vector signed char)srcO); \ 
1174 
dstW = (vector signed short)vec_mergel((vector signed char)vzero, \ 
1175 
(vector signed char)dstO); \ 
1176 
/* subtractions inside the first butterfly */ \

1177 
but0 = vec_sub(srcV, dstV); \ 
1178 
but0S = vec_sub(srcW, dstW); \ 
1179 
op1 = vec_perm(but0, but0, perm1); \ 
1180 
but1 = vec_mladd(but0, vprod1, op1); \ 
1181 
op1S = vec_perm(but0S, but0S, perm1); \ 
1182 
but1S = vec_mladd(but0S, vprod1, op1S); \ 
1183 
op2 = vec_perm(but1, but1, perm2); \ 
1184 
but2 = vec_mladd(but1, vprod2, op2); \ 
1185 
op2S = vec_perm(but1S, but1S, perm2); \ 
1186 
but2S = vec_mladd(but1S, vprod2, op2S); \ 
1187 
op3 = vec_perm(but2, but2, perm3); \ 
1188 
res1 = vec_mladd(but2, vprod3, op3); \ 
1189 
op3S = vec_perm(but2S, but2S, perm3); \ 
1190 
res2 = vec_mladd(but2S, vprod3, op3S); \ 
1191 
} 
1192 
ONEITERBUTTERFLY(0, temp0, temp0S);

1193 
ONEITERBUTTERFLY(1, temp1, temp1S);

1194 
ONEITERBUTTERFLY(2, temp2, temp2S);

1195 
ONEITERBUTTERFLY(3, temp3, temp3S);

1196 
ONEITERBUTTERFLY(4, temp4, temp4S);

1197 
ONEITERBUTTERFLY(5, temp5, temp5S);

1198 
ONEITERBUTTERFLY(6, temp6, temp6S);

1199 
ONEITERBUTTERFLY(7, temp7, temp7S);

1200 
} 
1201 
#undef ONEITERBUTTERFLY

1202 
{ 
1203 
register vector signed int vsum; 
1204 
register vector signed short line0S, line1S, line2S, line3S, line4S, 
1205 
line5S, line6S, line7S, line0BS,line2BS, 
1206 
line1BS,line3BS,line4BS,line6BS,line5BS, 
1207 
line7BS,line0CS,line4CS,line1CS,line5CS, 
1208 
line2CS,line6CS,line3CS,line7CS; 
1209  
1210 
register vector signed short line0 = vec_add(temp0, temp1); 
1211 
register vector signed short line1 = vec_sub(temp0, temp1); 
1212 
register vector signed short line2 = vec_add(temp2, temp3); 
1213 
register vector signed short line3 = vec_sub(temp2, temp3); 
1214 
register vector signed short line4 = vec_add(temp4, temp5); 
1215 
register vector signed short line5 = vec_sub(temp4, temp5); 
1216 
register vector signed short line6 = vec_add(temp6, temp7); 
1217 
register vector signed short line7 = vec_sub(temp6, temp7); 
1218  
1219 
register vector signed short line0B = vec_add(line0, line2); 
1220 
register vector signed short line2B = vec_sub(line0, line2); 
1221 
register vector signed short line1B = vec_add(line1, line3); 
1222 
register vector signed short line3B = vec_sub(line1, line3); 
1223 
register vector signed short line4B = vec_add(line4, line6); 
1224 
register vector signed short line6B = vec_sub(line4, line6); 
1225 
register vector signed short line5B = vec_add(line5, line7); 
1226 
register vector signed short line7B = vec_sub(line5, line7); 
1227  
1228 
register vector signed short line0C = vec_add(line0B, line4B); 
1229 
register vector signed short line4C = vec_sub(line0B, line4B); 
1230 
register vector signed short line1C = vec_add(line1B, line5B); 
1231 
register vector signed short line5C = vec_sub(line1B, line5B); 
1232 
register vector signed short line2C = vec_add(line2B, line6B); 
1233 
register vector signed short line6C = vec_sub(line2B, line6B); 
1234 
register vector signed short line3C = vec_add(line3B, line7B); 
1235 
register vector signed short line7C = vec_sub(line3B, line7B); 
1236  
1237 
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));

1238 
vsum = vec_sum4s(vec_abs(line1C), vsum); 
1239 
vsum = vec_sum4s(vec_abs(line2C), vsum); 
1240 
vsum = vec_sum4s(vec_abs(line3C), vsum); 
1241 
vsum = vec_sum4s(vec_abs(line4C), vsum); 
1242 
vsum = vec_sum4s(vec_abs(line5C), vsum); 
1243 
vsum = vec_sum4s(vec_abs(line6C), vsum); 
1244 
vsum = vec_sum4s(vec_abs(line7C), vsum); 
1245  
1246 
line0S = vec_add(temp0S, temp1S); 
1247 
line1S = vec_sub(temp0S, temp1S); 
1248 
line2S = vec_add(temp2S, temp3S); 
1249 
line3S = vec_sub(temp2S, temp3S); 
1250 
line4S = vec_add(temp4S, temp5S); 
1251 
line5S = vec_sub(temp4S, temp5S); 
1252 
line6S = vec_add(temp6S, temp7S); 
1253 
line7S = vec_sub(temp6S, temp7S); 
1254  
1255 
line0BS = vec_add(line0S, line2S); 
1256 
line2BS = vec_sub(line0S, line2S); 
1257 
line1BS = vec_add(line1S, line3S); 
1258 
line3BS = vec_sub(line1S, line3S); 
1259 
line4BS = vec_add(line4S, line6S); 
1260 
line6BS = vec_sub(line4S, line6S); 
1261 
line5BS = vec_add(line5S, line7S); 
1262 
line7BS = vec_sub(line5S, line7S); 
1263  
1264 
line0CS = vec_add(line0BS, line4BS); 
1265 
line4CS = vec_sub(line0BS, line4BS); 
1266 
line1CS = vec_add(line1BS, line5BS); 
1267 
line5CS = vec_sub(line1BS, line5BS); 
1268 
line2CS = vec_add(line2BS, line6BS); 
1269 
line6CS = vec_sub(line2BS, line6BS); 
1270 
line3CS = vec_add(line3BS, line7BS); 
1271 
line7CS = vec_sub(line3BS, line7BS); 
1272  
1273 
vsum = vec_sum4s(vec_abs(line0CS), vsum); 
1274 
vsum = vec_sum4s(vec_abs(line1CS), vsum); 
1275 
vsum = vec_sum4s(vec_abs(line2CS), vsum); 
1276 
vsum = vec_sum4s(vec_abs(line3CS), vsum); 
1277 
vsum = vec_sum4s(vec_abs(line4CS), vsum); 
1278 
vsum = vec_sum4s(vec_abs(line5CS), vsum); 
1279 
vsum = vec_sum4s(vec_abs(line6CS), vsum); 
1280 
vsum = vec_sum4s(vec_abs(line7CS), vsum); 
1281 
vsum = vec_sums(vsum, (vector signed int)vzero); 
1282 
vsum = vec_splat(vsum, 3);

1283 
vec_ste(vsum, 0, &sum);

1284 
} 
1285 
return sum;

1286 
} 
1287  
1288 
/**
 * Hadamard difference score for a 16-pixel-wide region.
 *
 * The top eight rows are always scored with hadamard8_diff16x8_altivec().
 * Only when h equals 16 is the region eight rows further down scored as
 * well and added to the result; any other h yields the top score alone.
 *
 * @param s      passed through unchanged to hadamard8_diff16x8_altivec()
 * @param dst    first pixel block
 * @param src    second pixel block
 * @param stride byte distance between consecutive rows of both blocks
 * @param h      region height; only the value 16 enables the second half
 * @return the sum of the 16x8 scores that were computed
 */
static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    const int top_score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

    if (h != 16) {
        return top_score;
    }

    /* second 16x8 strip starts eight rows below the first one */
    return top_score + hadamard8_diff16x8_altivec(s, dst + 8 * stride,
                                                  src + 8 * stride,
                                                  stride, 8);
}
1298  
1299 
static void vorbis_inverse_coupling_altivec(float *mag, float *ang, 
1300 
int blocksize)

1301 
{ 
1302 
int i;

1303 
vector float m, a;

1304 
vector bool int t0, t1; 
1305 
const vector unsigned int v_31 = //XXX 
1306 
vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1)); 
1307 
for (i = 0; i < blocksize; i += 4) { 
1308 
m = vec_ld(0, mag+i);

1309 
a = vec_ld(0, ang+i);

1310 
t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); 
1311 
t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); 
1312 
a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31)); 
1313 
t0 = (vector bool int)vec_and(a, t1); 
1314 
t1 = (vector bool int)vec_andc(a, t1); 
1315 
a = vec_sub(m, (vector float)t1);

1316 
m = vec_add(m, (vector float)t0);

1317 
vec_stl(a, 0, ang+i);

1318 
vec_stl(m, 0, mag+i);

1319 
} 
1320 
} 
1321  
1322 
/* next one assumes that ((line_size % 8) == 0) */

1323 
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
1324 
{ 
1325 
register int i; 
1326 
register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 
1327 
register vector unsigned char blockv, temp1, temp2, blocktemp; 
1328 
register vector unsigned short pixelssum1, pixelssum2, temp3; 
1329  
1330 
register const vector unsigned char vczero = (const vector unsigned char) 
1331 
vec_splat_u8(0);

1332 
register const vector unsigned short vctwo = (const vector unsigned short) 
1333 
vec_splat_u16(2);

1334  
1335 
temp1 = vec_ld(0, pixels);

1336 
temp2 = vec_ld(16, pixels);

1337 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

1338 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
1339 
pixelsv2 = temp2; 
1340 
} else {

1341 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

1342 
} 
1343 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1344 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1345 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1346 
(vector unsigned short)pixelsv2); 
1347 
pixelssum1 = vec_add(pixelssum1, vctwo); 
1348  
1349 
for (i = 0; i < h ; i++) { 
1350 
int rightside = ((unsigned long)block & 0x0000000F); 
1351 
blockv = vec_ld(0, block);

1352  
1353 
temp1 = vec_ld(line_size, pixels); 
1354 
temp2 = vec_ld(line_size + 16, pixels);

1355 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1356 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 
1357 
pixelsv2 = temp2; 
1358 
} else {

1359 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1360 
} 
1361  
1362 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1363 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1364 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1365 
(vector unsigned short)pixelsv2); 
1366 
temp3 = vec_add(pixelssum1, pixelssum2); 
1367 
temp3 = vec_sra(temp3, vctwo); 
1368 
pixelssum1 = vec_add(pixelssum2, vctwo); 
1369 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
1370  
1371 
if (rightside) {

1372 
blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
1373 
} else {

1374 
blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
1375 
} 
1376  
1377 
blockv = vec_avg(blocktemp, blockv); 
1378 
vec_st(blockv, 0, block);

1379  
1380 
block += line_size; 
1381 
pixels += line_size; 
1382 
} 
1383 
} 
1384  
1385 
/**
 * Install the AltiVec implementations defined in this file into a
 * DSPContext.
 *
 * Only the function pointers listed below are overwritten; every other
 * member of *c is left as it was.
 *
 * @param c     context whose function pointers are replaced
 * @param avctx not used by this function
 */
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    /* sum of absolute differences */
    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0]= sad16_altivec;
    c->sad[1]= sad8_altivec;

    /* norms, squared errors and pixel sums */
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1]= sse8_altivec;
    c->sse[0]= sse16_altivec;
    c->pix_sum = pix_sum_altivec;

    /* block helpers */
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels = get_pixels_altivec;
    c->clear_block = clear_block_altivec;
    c->add_bytes= add_bytes_altivec;

    /* pixel copy / average / half-pel interpolation */
    c->put_pixels_tab[0][0] = put_pixels16_altivec;
    /* the two functions do the same thing, so use the same code */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
    c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
    c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;

    /* Hadamard difference metrics */
    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;

    /* only installed when CONFIG_VORBIS_DECODER is non-zero */
    if (CONFIG_VORBIS_DECODER) {
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
    }
}