/*


* Copyright (c) 2002 Brian Foley

* Copyright (c) 2002 Dieter Shirley

* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>

*

* This library is free software; you can redistribute it and/or

* modify it under the terms of the GNU Lesser General Public

* License as published by the Free Software Foundation; either

* version 2 of the License, or (at your option) any later version.

*

* This library is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

* Lesser General Public License for more details.

*

* You should have received a copy of the GNU Lesser General Public

* License along with this library; if not, write to the Free Software

* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

*/

21 
#include "../dsputil.h" 
23 
#include "gcc_fixes.h" 
25 
#include "dsputil_altivec.h" 
27 
#ifdef CONFIG_DARWIN

#include <sys/sysctl.h> 
#else /* CONFIG_DARWIN */ 
#include <signal.h> 
#include <setjmp.h> 
33 
/* Context that sigill_handler() jumps back to when canjump is set. */
static sigjmp_buf jmpbuf;

/* Non-zero only while jmpbuf holds a context that is safe to jump to. */
static volatile sig_atomic_t canjump = 0;

/*
 * Signal handler that turns a trapped signal into a siglongjmp().
 * NOTE(review): presumably installed for SIGILL by a CPU-feature probe
 * that is not visible in this part of the file -- confirm against caller.
 *
 * If no jump target is armed (canjump == 0) the signal is not ours:
 * restore the default disposition and re-raise it.  Otherwise disarm the
 * target and jump back to the sigsetjmp() site with return value 1.
 */
static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
        /*
         * If raise() returns (for instance because the signal is blocked
         * while its own handler runs), leave the handler instead of
         * jumping through a jmpbuf that was never initialised.
         */
        return;
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* CONFIG_DARWIN */ 
48 
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s __attribute__((aligned(16))); 
const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); 
vector unsigned char *tv; 
vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
58 
s = 0;

sad = (vector unsigned int)vec_splat_u32(0); 
for(i=0;i<h;i++) { 
/*

Read unaligned pixels into our vectors. The vectors are as follows:

pix1v: pix1[0]pix1[15]

pix2v: pix2[0]pix2[15] pix2iv: pix2[1]pix2[16]

*/

tv = (vector unsigned char *) pix1; 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
69 
tv = (vector unsigned char *) &pix2[0]; 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
72 
tv = (vector unsigned char *) &pix2[1]; 
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); 
75 
/* Calculate the average vector */

avgv = vec_avg(pix2v, pix2iv); 
78 
/* Calculate a sum of abs differences vector */

t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
81 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
84 
pix1 += line_size; 
pix2 += line_size; 
} 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

92 
return s;

} 
95 
int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s __attribute__((aligned(16))); 
const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); 
vector unsigned char *tv; 
vector unsigned char pix1v, pix2v, pix3v, avgv, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
uint8_t *pix3 = pix2 + line_size; 
106 
s = 0;

sad = (vector unsigned int)vec_splat_u32(0); 
109 
/*

Due to the fact that pix3 = pix2 + line_size, the pix3 of one

iteration becomes pix2 in the next iteration. We can use this

fact to avoid a potentially expensive unaligned read, each

time around the loop.

Read unaligned pixels into our vectors. The vectors are as follows:

pix2v: pix2[0]pix2[15]

Split the pixel vectors into shorts

*/

tv = (vector unsigned char *) &pix2[0]; 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
121 
for(i=0;i<h;i++) { 
/*

Read unaligned pixels into our vectors. The vectors are as follows:

pix1v: pix1[0]pix1[15]

pix3v: pix3[0]pix3[15]

*/

tv = (vector unsigned char *) pix1; 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
130 
tv = (vector unsigned char *) &pix3[0]; 
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); 
133 
/* Calculate the average vector */

avgv = vec_avg(pix2v, pix3v); 
136 
/* Calculate a sum of abs differences vector */

t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
139 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
142 
pix1 += line_size; 
pix2v = pix3v; 
pix3 += line_size; 
146 
} 
148 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

return s;

} 
155 
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s __attribute__((aligned(16))); 
uint8_t *pix3 = pix2 + line_size; 
const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); 
const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2); 
vector unsigned char *tv, avgv, t5; 
vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; 
vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; 
vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; 
vector unsigned short avghv, avglv; 
vector unsigned short t1, t2, t3, t4; 
vector unsigned int sad; 
vector signed int sumdiffs; 
171 
sad = (vector unsigned int)vec_splat_u32(0); 
173 
s = 0;

175 
/*

Due to the fact that pix3 = pix2 + line_size, the pix3 of one

iteration becomes pix2 in the next iteration. We can use this

fact to avoid a potentially expensive unaligned read, as well

as some splitting, and vector addition each time around the loop.

Read unaligned pixels into our vectors. The vectors are as follows:

pix2v: pix2[0]pix2[15] pix2iv: pix2[1]pix2[16]

Split the pixel vectors into shorts

*/

tv = (vector unsigned char *) &pix2[0]; 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
187 
tv = (vector unsigned char *) &pix2[1]; 
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); 
190 
pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); 
pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); 
pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); 
pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); 
t1 = vec_add(pix2hv, pix2ihv); 
t2 = vec_add(pix2lv, pix2ilv); 
197 
for(i=0;i<h;i++) { 
/*

Read unaligned pixels into our vectors. The vectors are as follows:

pix1v: pix1[0]pix1[15]

pix3v: pix3[0]pix3[15] pix3iv: pix3[1]pix3[16]

*/

tv = (vector unsigned char *) pix1; 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
206 
tv = (vector unsigned char *) &pix3[0]; 
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); 
209 
tv = (vector unsigned char *) &pix3[1]; 
pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); 
212 
/*

Note that Altivec does have vec_avg, but this works on vector pairs

and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding

would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.

Instead, we have to split the pixel vectors into vectors of shorts,

and do the averaging by hand.

*/

220 
/* Split the pixel vectors into shorts */

pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); 
pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); 
pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); 
pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); 
226 
/* Do the averaging on them */

t3 = vec_add(pix3hv, pix3ihv); 
t4 = vec_add(pix3lv, pix3ilv); 
230 
avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); 
avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); 
233 
/* Pack the shorts back into a result */

avgv = vec_pack(avghv, avglv); 
236 
/* Calculate a sum of abs differences vector */

t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
239 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
242 
pix1 += line_size; 
pix3 += line_size; 
/* Transfer the calculated values for pix3 into pix2 */

t1 = t3; 
t2 = t4; 
} 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

253 
return s;

} 
256 
int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s __attribute__((aligned(16))); 
const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm1, perm2, *pix1v, *pix2v; 
vector unsigned char t1, t2, t3,t4, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
266 
sad = (vector unsigned int)vec_splat_u32(0); 
269 
for(i=0;i<h;i++) { 
/* Read potentially unaligned pixels into t1 and t2 */

perm1 = vec_lvsl(0, pix1);

pix1v = (vector unsigned char *) pix1; 
perm2 = vec_lvsl(0, pix2);

pix2v = (vector unsigned char *) pix2; 
t1 = vec_perm(pix1v[0], pix1v[1], perm1); 
t2 = vec_perm(pix2v[0], pix2v[1], perm2); 
278 
/* Calculate a sum of abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
283 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
286 
pix1 += line_size; 
pix2 += line_size; 
} 
290 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

295 
return s;

} 
298 
int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s __attribute__((aligned(16))); 
const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 
vector unsigned char t1, t2, t3,t4, t5; 
vector unsigned int sad; 
vector signed int sumdiffs; 
308 
sad = (vector unsigned int)vec_splat_u32(0); 
310 
permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); 
312 
for(i=0;i<h;i++) { 
/* Read potentially unaligned pixels into t1 and t2

Since we're reading 16 pixels, and actually only want 8,

mask out the last 8 pixels. The 0s don't change the sum. */

perm1 = vec_lvsl(0, pix1);

pix1v = (vector unsigned char *) pix1; 
perm2 = vec_lvsl(0, pix2);

pix2v = (vector unsigned char *) pix2; 
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); 
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); 
323 
/* Calculate a sum of abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
328 
/* Add each 4 pixel group together and put 4 results into sad */

sad = vec_sum4s(t5, sad); 
331 
pix1 += line_size; 
pix2 += line_size; 
} 
335 
/* Sum up the four partial sums, and put the result into s */

sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
sumdiffs = vec_splat(sumdiffs, 3);

vec_ste(sumdiffs, 0, &s);

340 
return s;

} 
343 
int pix_norm1_altivec(uint8_t *pix, int line_size) 
{ 
int i;

int s __attribute__((aligned(16))); 
const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); 
vector unsigned char *tv; 
vector unsigned char pixv; 
vector unsigned int sv; 
vector signed int sum; 
353 
sv = (vector unsigned int)vec_splat_u32(0); 
355 
s = 0;

for (i = 0; i < 16; i++) { 
/* Read in the potentially unaligned pixels */

tv = (vector unsigned char *) pix; 
pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); 
361 
/* Square the values, and add them to our sum */

sv = vec_msum(pixv, pixv, sv); 
364 
} 
/* Sum up the four partial sums, and put the result into s */

sum = vec_sums((vector signed int) sv, (vector signed int) zero); 
sum = vec_splat(sum, 3);

vec_ste(sum, 0, &s);

371 
return s;

} 
374 
/**

* Sum of Squared Errors for a 8x8 block.

* AltiVecenhanced.

* It's the sad8_altivec code above w/ squaring added.

*/

int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s __attribute__((aligned(16))); 
const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 
vector unsigned char t1, t2, t3,t4, t5; 
vector unsigned int sum; 
vector signed int sumsqr; 
389 
sum = (vector unsigned int)vec_splat_u32(0); 
391 
permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); 
394 
for(i=0;i<h;i++) { 
/* Read potentially unaligned pixels into t1 and t2

Since we're reading 16 pixels, and actually only want 8,

mask out the last 8 pixels. The 0s don't change the sum. */

perm1 = vec_lvsl(0, pix1);

pix1v = (vector unsigned char *) pix1; 
perm2 = vec_lvsl(0, pix2);

pix2v = (vector unsigned char *) pix2; 
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); 
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); 
405 
/*

Since we want to use unsigned chars, we can take advantage

of the fact that abs(ab)^2 = (ab)^2.

*/

410 
/* Calculate abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
415 
/* Square the values and add them to our sum */

sum = vec_msum(t5, t5, sum); 
418 
pix1 += line_size; 
pix2 += line_size; 
} 
422 
/* Sum up the four partial sums, and put the result into s */

sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); 
sumsqr = vec_splat(sumsqr, 3);

vec_ste(sumsqr, 0, &s);

427 
return s;

} 
430 
/**

* Sum of Squared Errors for a 16x16 block.

* AltiVecenhanced.

* It's the sad16_altivec code above w/ squaring added.

*/

int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
{ 
int i;

int s __attribute__((aligned(16))); 
const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm1, perm2, *pix1v, *pix2v; 
vector unsigned char t1, t2, t3,t4, t5; 
vector unsigned int sum; 
vector signed int sumsqr; 
445 
sum = (vector unsigned int)vec_splat_u32(0); 
447 
for(i=0;i<h;i++) { 
/* Read potentially unaligned pixels into t1 and t2 */

perm1 = vec_lvsl(0, pix1);

pix1v = (vector unsigned char *) pix1; 
perm2 = vec_lvsl(0, pix2);

pix2v = (vector unsigned char *) pix2; 
t1 = vec_perm(pix1v[0], pix1v[1], perm1); 
t2 = vec_perm(pix2v[0], pix2v[1], perm2); 
456 
/*

Since we want to use unsigned chars, we can take advantage

of the fact that abs(ab)^2 = (ab)^2.

*/

461 
/* Calculate abs differences vector */

t3 = vec_max(t1, t2); 
t4 = vec_min(t1, t2); 
t5 = vec_sub(t3, t4); 
466 
/* Square the values and add them to our sum */

sum = vec_msum(t5, t5, sum); 
469 
pix1 += line_size; 
pix2 += line_size; 
} 
473 
/* Sum up the four partial sums, and put the result into s */

sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); 
sumsqr = vec_splat(sumsqr, 3);

vec_ste(sumsqr, 0, &s);

478 
return s;

} 
481 
int pix_sum_altivec(uint8_t * pix, int line_size) 
{ 
const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); 
vector unsigned char perm, *pixv; 
vector unsigned char t1; 
vector unsigned int sad; 
vector signed int sumdiffs; 
489 
int i;

int s __attribute__((aligned(16))); 
492 
sad = (vector unsigned int)vec_splat_u32(0); 
494 
for (i = 0; i < 16; i++) { 
/* Read the potentially unaligned 16 pixels into t1 */

perm = vec_lvsl(0, pix);

pixv = (vector unsigned char *) pix; 
t1 = vec_perm(pixv[0], pixv[1], perm); 
500 
/* Add each 4 pixel group together and put 4 results into sad */

501 
sad = vec_sum4s(t1, sad); 
503 
pix += line_size; 
} 
506 
/* Sum up the four partial sums, and put the result into s */

507 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
508 
sumdiffs = vec_splat(sumdiffs, 3);

509 
vec_ste(sumdiffs, 0, &s);

511 
return s;

512 
} 
514 
/**
 * Read an 8x8 block of unsigned 8-bit pixels and store it, widened to
 * 16 bits, as 8 consecutive rows of 8 coefficients in block.
 *
 * @param block      destination, 64 coefficients; assumed 16-byte aligned
 *                   because rows are written with vec_st
 * @param pixels     top-left source pixel (may be unaligned)
 * @param line_size  byte distance between consecutive source rows
 */
void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned;
        // each row of 8 shorts occupies 16 bytes
        vec_st(shorts, i * 16, (vector signed short*)block);

        pixels += line_size;
    }
}
540 
/**
 * Compute the per-pixel difference s1 - s2 of two 8x8 blocks and store it
 * as 8 consecutive rows of 8 sixteen-bit values in block.
 *
 * The loop body handles two rows per iteration (manual unroll), so four
 * iterations cover the eight rows.
 *
 * @param block   destination, 64 coefficients; assumed 16-byte aligned
 *                because rows are written with vec_st
 * @param s1      minuend block (may be unaligned)
 * @param s2      subtrahend block (may be unaligned)
 * @param stride  byte distance between consecutive rows of s1 and of s2
 */
void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++)
    {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels; without this reload
        // shorts2 would be built from the s1 row and the difference
        // would always be zero.
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
612 
/**
 * Add src to dst byte-wise (modulo 256): dst[i] += src[i] for 0 <= i < w.
 *
 * @param dst  destination / first operand; the vector path loads and stores
 *             it with vec_ld/vec_st, so it must be 16-byte aligned there
 * @param src  second operand; same alignment requirement in the vector path
 * @param w    number of bytes to process
 */
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    /* Eight bytes per pass, so the index must advance by eight. */
    for (i = 0; i + 7 < w; i += 8) {
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for (; i < w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    /* i is a byte offset: one 16-byte vector per pass. */
    for (i = 0; (i + 15) < w; i += 16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16: add (not copy) the remaining bytes */
    for (; (i < w); i++)
    {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
647 
/* next one assumes that ((line_size % 16) == 0) */

/**
 * Copy a 16-pixel-wide block of rows from pixels to block.
 *
 * AltiVec path: the permute vector is derived once from pixels and reused
 * for every row, which relies on line_size being a multiple of 16.  Rows are
 * written with vec_st, so block is expected to be 16-byte aligned.  The loop
 * is unrolled by four, so rows are processed in groups of four.
 *
 * @param block      destination
 * @param pixels     source (may be unaligned)
 * @param line_size  byte distance between consecutive rows (both buffers)
 * @param h          number of rows
 */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        *((uint32_t*)(block)) = LD32(pixels);
        *((uint32_t*)(block+4)) = LD32(pixels+4);
        *((uint32_t*)(block+8)) = LD32(pixels+8);
        *((uint32_t*)(block+12)) = LD32(pixels+12);
        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    /* Byte offsets of rows 2 and 3 of a group, and the 4-row stride */
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
// hand-unrolling the loop by 4 gains about 15%
// mininum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
#if 0
    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }
#else
    for (i = 0; i < h; i += 4) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block += line_size_4;
    }
#endif
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
721 
/* next one assumes that ((line_size % 16) == 0) */

/*
 * a = per-byte average of the four bytes packed in a and b, rounding up:
 * (x | y) - (((x ^ y) & 0xFE) >> 1) == (x + y + 1) >> 1 for each byte.
 * The 0xFEFEFEFE mask stops the shift from leaking bits across bytes.
 * Note that a is evaluated more than once and must be an lvalue.
 */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/**
 * Average a 16-pixel-wide source block into block, row by row:
 * block = avg(block, pixels), rounding up.
 *
 * AltiVec path: the permute vector is derived once from pixels and reused
 * for every row, which relies on line_size being a multiple of 16.  block is
 * loaded and stored with vec_ld/vec_st, so it is expected to be 16-byte
 * aligned.
 *
 * @param block      destination and first operand
 * @param pixels     source (may be unaligned)
 * @param line_size  byte distance between consecutive rows (both buffers)
 * @param h          number of rows
 */
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        op_avg(*((uint32_t*)(block)),LD32(pixels));
        op_avg(*((uint32_t*)(block+4)),LD32(pixels+4));
        op_avg(*((uint32_t*)(block+8)),LD32(pixels+8));
        op_avg(*((uint32_t*)(block+12)),LD32(pixels+12));
        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        /* Two aligned loads + permute give the 16 unaligned source bytes */
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv,pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }

POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
764  
765 
/* next one assumes that ((line_size % 8) == 0) */

766 
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
767 
{ 
768 
POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);

769 
#ifdef ALTIVEC_USE_REFERENCE_C_CODE

770 
int i;

771 
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

772 
for (i = 0; i < h; i++) { 
773 
*((uint32_t *) (block)) = 
774 
(((*((uint32_t *) (block)))  
775 
((((const struct unaligned_32 *) (pixels))>l)))  
776 
((((*((uint32_t *) (block))) ^ 
777 
((((const struct unaligned_32 *) (pixels))> 
778 
l))) & 0xFEFEFEFEUL) >> 1)); 
779 
*((uint32_t *) (block + 4)) =

780 
(((*((uint32_t *) (block + 4))) 

781 
((((const struct unaligned_32 *) (pixels + 4))>l)))  
782 
((((*((uint32_t *) (block + 4))) ^

783 
((((const struct unaligned_32 *) (pixels + 
784 
4))>

785 
l))) & 0xFEFEFEFEUL) >> 1)); 
786 
pixels += line_size; 
787 
block += line_size; 
788 
} 
789 
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

790  
791 
#else /* ALTIVEC_USE_REFERENCE_C_CODE */ 
792 
register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 
793 
int i;

794  
795 
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

796 

797 
for (i = 0; i < h; i++) { 
798 
/*

799 
block is 8 bytesaligned, so we're either in the

800 
left block (16 bytesaligned) or in the right block (not)

801 
*/

802 
int rightside = ((unsigned long)block & 0x0000000F); 
803 

804 
blockv = vec_ld(0, block);

805 
pixelsv1 = vec_ld(0, (unsigned char*)pixels); 
806 
pixelsv2 = vec_ld(16, (unsigned char*)pixels); 
807 
pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

808 

809 
if (rightside)

810 
{ 
811 
pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); 
812 
} 
813 
else

814 
{ 
815 
pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); 
816 
} 
817 

818 
blockv = vec_avg(blockv, pixelsv); 
819  
820 
vec_st(blockv, 0, block);

821 

822 
pixels += line_size; 
823 
block += line_size; 
824 
} 
825 

826 
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

827 

828 
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 
829 
} 
830  
831 
/* next one assumes that ((line_size % 8) == 0) */

832 
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
833 
{ 
834 
POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);

835 
#ifdef ALTIVEC_USE_REFERENCE_C_CODE

836 
int j;

837 
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);

838 
for (j = 0; j < 2; j++) { 
839 
int i;

840 
const uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
841 
const uint32_t b =

842 
(((const struct unaligned_32 *) (pixels + 1))>l); 
843 
uint32_t l0 = 
844 
(a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; 
845 
uint32_t h0 = 
846 
((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
847 
uint32_t l1, h1; 
848 
pixels += line_size; 
849 
for (i = 0; i < h; i += 2) { 
850 
uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
851 
uint32_t b = (((const struct unaligned_32 *) (pixels + 1))>l); 
852 
l1 = (a & 0x03030303UL) + (b & 0x03030303UL); 
853 
h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
854 
*((uint32_t *) block) = 
855 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
856 
pixels += line_size; 
857 
block += line_size; 
858 
a = (((const struct unaligned_32 *) (pixels))>l); 
859 
b = (((const struct unaligned_32 *) (pixels + 1))>l); 
860 
l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; 
861 
h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
862 
*((uint32_t *) block) = 
863 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
864 
pixels += line_size; 
865 
block += line_size; 
866 
} pixels += 4  line_size * (h + 1); 
867 
block += 4  line_size * h;

868 
} 
869  
870 
POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

871  
872 
#else /* ALTIVEC_USE_REFERENCE_C_CODE */ 
873 
register int i; 
874 
register vector unsigned char 
875 
pixelsv1, pixelsv2, 
876 
pixelsavg; 
877 
register vector unsigned char 
878 
blockv, temp1, temp2; 
879 
register vector unsigned short 
880 
pixelssum1, pixelssum2, temp3; 
881 
register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); 
882 
register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); 
883 

884 
temp1 = vec_ld(0, pixels);

885 
temp2 = vec_ld(16, pixels);

886 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

887 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
888 
{ 
889 
pixelsv2 = temp2; 
890 
} 
891 
else

892 
{ 
893 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

894 
} 
895 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
896 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
897 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
898 
(vector unsigned short)pixelsv2); 
899 
pixelssum1 = vec_add(pixelssum1, vctwo); 
900 

901 
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);

902 
for (i = 0; i < h ; i++) { 
903 
int rightside = ((unsigned long)block & 0x0000000F); 
904 
blockv = vec_ld(0, block);

905  
906 
temp1 = vec_ld(line_size, pixels); 
907 
temp2 = vec_ld(line_size + 16, pixels);

908 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
909 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
910 
{ 
911 
pixelsv2 = temp2; 
912 
} 
913 
else

914 
{ 
915 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

916 
} 
917  
918 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
919 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
920 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
921 
(vector unsigned short)pixelsv2); 
922 
temp3 = vec_add(pixelssum1, pixelssum2); 
923 
temp3 = vec_sra(temp3, vctwo); 
924 
pixelssum1 = vec_add(pixelssum2, vctwo); 
925 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
926 

927 
if (rightside)

928 
{ 
929 
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
930 
} 
931 
else

932 
{ 
933 
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
934 
} 
935 

936 
vec_st(blockv, 0, block);

937 

938 
block += line_size; 
939 
pixels += line_size; 
940 
} 
941 

942 
POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

943 
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 
944 
} 
945  
946 
/* next one assumes that ((line_size % 8) == 0) */

947 
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
948 
{ 
949 
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);

950 
#ifdef ALTIVEC_USE_REFERENCE_C_CODE

951 
int j;

952 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

953 
for (j = 0; j < 2; j++) { 
954 
int i;

955 
const uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
956 
const uint32_t b =

957 
(((const struct unaligned_32 *) (pixels + 1))>l); 
958 
uint32_t l0 = 
959 
(a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; 
960 
uint32_t h0 = 
961 
((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
962 
uint32_t l1, h1; 
963 
pixels += line_size; 
964 
for (i = 0; i < h; i += 2) { 
965 
uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
966 
uint32_t b = (((const struct unaligned_32 *) (pixels + 1))>l); 
967 
l1 = (a & 0x03030303UL) + (b & 0x03030303UL); 
968 
h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
969 
*((uint32_t *) block) = 
970 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
971 
pixels += line_size; 
972 
block += line_size; 
973 
a = (((const struct unaligned_32 *) (pixels))>l); 
974 
b = (((const struct unaligned_32 *) (pixels + 1))>l); 
975 
l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; 
976 
h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
977 
*((uint32_t *) block) = 
978 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
979 
pixels += line_size; 
980 
block += line_size; 
981 
} pixels += 4  line_size * (h + 1); 
982 
block += 4  line_size * h;

983 
} 
984 

985 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

986  
987 
#else /* ALTIVEC_USE_REFERENCE_C_CODE */ 
988 
register int i; 
989 
register vector unsigned char 
990 
pixelsv1, pixelsv2, 
991 
pixelsavg; 
992 
register vector unsigned char 
993 
blockv, temp1, temp2; 
994 
register vector unsigned short 
995 
pixelssum1, pixelssum2, temp3; 
996 
register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); 
997 
register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1); 
998 
register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); 
999 

1000 
temp1 = vec_ld(0, pixels);

1001 
temp2 = vec_ld(16, pixels);

1002 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

1003 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
1004 
{ 
1005 
pixelsv2 = temp2; 
1006 
} 
1007 
else

1008 
{ 
1009 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

1010 
} 
1011 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1012 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1013 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1014 
(vector unsigned short)pixelsv2); 
1015 
pixelssum1 = vec_add(pixelssum1, vcone); 
1016 

1017 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

1018 
for (i = 0; i < h ; i++) { 
1019 
int rightside = ((unsigned long)block & 0x0000000F); 
1020 
blockv = vec_ld(0, block);

1021  
1022 
temp1 = vec_ld(line_size, pixels); 
1023 
temp2 = vec_ld(line_size + 16, pixels);

1024 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1025 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
1026 
{ 
1027 
pixelsv2 = temp2; 
1028 
} 
1029 
else

1030 
{ 
1031 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1032 
} 
1033  
1034 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1035 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1036 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1037 
(vector unsigned short)pixelsv2); 
1038 
temp3 = vec_add(pixelssum1, pixelssum2); 
1039 
temp3 = vec_sra(temp3, vctwo); 
1040 
pixelssum1 = vec_add(pixelssum2, vcone); 
1041 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
1042 

1043 
if (rightside)

1044 
{ 
1045 
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
1046 
} 
1047 
else

1048 
{ 
1049 
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
1050 
} 
1051 

1052 
vec_st(blockv, 0, block);

1053 

1054 
block += line_size; 
1055 
pixels += line_size; 
1056 
} 
1057 

1058 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

1059 
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 
1060 
} 
/* The next function assumes that line_size is a multiple of 16, i.e. ((line_size % 16) == 0). */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
1064 
{ 
1065 
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);

1066 
#ifdef ALTIVEC_USE_REFERENCE_C_CODE

1067 
int j;

1068 
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

1069 
for (j = 0; j < 4; j++) { 
1070 
int i;

1071 
const uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
1072 
const uint32_t b =

1073 
(((const struct unaligned_32 *) (pixels + 1))>l); 
1074 
uint32_t l0 = 
1075 
(a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; 
1076 
uint32_t h0 = 
1077 
((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1078 
uint32_t l1, h1; 
1079 
pixels += line_size; 
1080 
for (i = 0; i < h; i += 2) { 
1081 
uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
1082 
uint32_t b = (((const struct unaligned_32 *) (pixels + 1))>l); 
1083 
l1 = (a & 0x03030303UL) + (b & 0x03030303UL); 
1084 
h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1085 
*((uint32_t *) block) = 
1086 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
1087 
pixels += line_size; 
1088 
block += line_size; 
1089 
a = (((const struct unaligned_32 *) (pixels))>l); 
1090 
b = (((const struct unaligned_32 *) (pixels + 1))>l); 
1091 
l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; 
1092 
h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1093 
*((uint32_t *) block) = 
1094 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
1095 
pixels += line_size; 
1096 
block += line_size; 
1097 
} pixels += 4  line_size * (h + 1); 
1098 
block += 4  line_size * h;

1099 
} 
1100  
1101 
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

1102  
1103 
#else /* ALTIVEC_USE_REFERENCE_C_CODE */ 
1104 
register int i; 
1105 
register vector unsigned char 
1106 
pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
1107 
register vector unsigned char 
1108 
blockv, temp1, temp2; 
1109 
register vector unsigned short 
1110 
pixelssum1, pixelssum2, temp3, 
1111 
pixelssum3, pixelssum4, temp4; 
1112 
register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); 
1113 
register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); 
1114  
1115 
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

1116 

1117 
temp1 = vec_ld(0, pixels);

1118 
temp2 = vec_ld(16, pixels);

1119 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

1120 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
1121 
{ 
1122 
pixelsv2 = temp2; 
1123 
} 
1124 
else

1125 
{ 
1126 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

1127 
} 
1128 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
1129 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1130 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1131 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1132 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
1133 
(vector unsigned short)pixelsv4); 
1134 
pixelssum3 = vec_add(pixelssum3, vctwo); 
1135 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1136 
(vector unsigned short)pixelsv2); 
1137 
pixelssum1 = vec_add(pixelssum1, vctwo); 
1138 

1139 
for (i = 0; i < h ; i++) { 
1140 
blockv = vec_ld(0, block);

1141  
1142 
temp1 = vec_ld(line_size, pixels); 
1143 
temp2 = vec_ld(line_size + 16, pixels);

1144 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1145 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
1146 
{ 
1147 
pixelsv2 = temp2; 
1148 
} 
1149 
else

1150 
{ 
1151 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1152 
} 
1153  
1154 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
1155 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1156 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1157 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1158 

1159 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
1160 
(vector unsigned short)pixelsv4); 
1161 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1162 
(vector unsigned short)pixelsv2); 
1163 
temp4 = vec_add(pixelssum3, pixelssum4); 
1164 
temp4 = vec_sra(temp4, vctwo); 
1165 
temp3 = vec_add(pixelssum1, pixelssum2); 
1166 
temp3 = vec_sra(temp3, vctwo); 
1167  
1168 
pixelssum3 = vec_add(pixelssum4, vctwo); 
1169 
pixelssum1 = vec_add(pixelssum2, vctwo); 
1170  
1171 
blockv = vec_packsu(temp3, temp4); 
1172 

1173 
vec_st(blockv, 0, block);

1174 

1175 
block += line_size; 
1176 
pixels += line_size; 
1177 
} 
1178 

1179 
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

1180 
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 
1181 
} 
/* The next function assumes that line_size is a multiple of 16, i.e. ((line_size % 16) == 0). */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
1185 
{ 
1186 
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);

1187 
#ifdef ALTIVEC_USE_REFERENCE_C_CODE

1188 
int j;

1189 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

1190 
for (j = 0; j < 4; j++) { 
1191 
int i;

1192 
const uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
1193 
const uint32_t b =

1194 
(((const struct unaligned_32 *) (pixels + 1))>l); 
1195 
uint32_t l0 = 
1196 
(a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; 
1197 
uint32_t h0 = 
1198 
((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1199 
uint32_t l1, h1; 
1200 
pixels += line_size; 
1201 
for (i = 0; i < h; i += 2) { 
1202 
uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
1203 
uint32_t b = (((const struct unaligned_32 *) (pixels + 1))>l); 
1204 
l1 = (a & 0x03030303UL) + (b & 0x03030303UL); 
1205 
h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1206 
*((uint32_t *) block) = 
1207 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
1208 
pixels += line_size; 
1209 
block += line_size; 
1210 
a = (((const struct unaligned_32 *) (pixels))>l); 
1211 
b = (((const struct unaligned_32 *) (pixels + 1))>l); 
1212 
l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; 
1213 
h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1214 
*((uint32_t *) block) = 
1215 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
1216 
pixels += line_size; 
1217 
block += line_size; 
1218 
} pixels += 4  line_size * (h + 1); 
1219 
block += 4  line_size * h;

1220 
} 
1221  
1222 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

1223  
1224 
#else /* ALTIVEC_USE_REFERENCE_C_CODE */ 
1225 
register int i; 
1226 
register vector unsigned char 
1227 
pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
1228 
register vector unsigned char 
1229 
blockv, temp1, temp2; 
1230 
register vector unsigned short 
1231 
pixelssum1, pixelssum2, temp3, 
1232 
pixelssum3, pixelssum4, temp4; 
1233 
register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); 
1234 
register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1); 
1235 
register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); 
1236  
1237 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

1238 

1239 
temp1 = vec_ld(0, pixels);

1240 
temp2 = vec_ld(16, pixels);

1241 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

1242 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
1243 
{ 
1244 
pixelsv2 = temp2; 
1245 
} 
1246 
else

1247 
{ 
1248 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

1249 
} 
1250 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
1251 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1252 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1253 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1254 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
1255 
(vector unsigned short)pixelsv4); 
1256 
pixelssum3 = vec_add(pixelssum3, vcone); 
1257 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1258 
(vector unsigned short)pixelsv2); 
1259 
pixelssum1 = vec_add(pixelssum1, vcone); 
1260 

1261 
for (i = 0; i < h ; i++) { 
1262 
blockv = vec_ld(0, block);

1263  
1264 
temp1 = vec_ld(line_size, pixels); 
1265 
temp2 = vec_ld(line_size + 16, pixels);

1266 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1267 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
1268 
{ 
1269 
pixelsv2 = temp2; 
1270 
} 
1271 
else

1272 
{ 
1273 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1274 
} 
1275  
1276 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
1277 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1278 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1279 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1280 

1281 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
1282 
(vector unsigned short)pixelsv4); 
1283 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1284 
(vector unsigned short)pixelsv2); 
1285 
temp4 = vec_add(pixelssum3, pixelssum4); 
1286 
temp4 = vec_sra(temp4, vctwo); 
1287 
temp3 = vec_add(pixelssum1, pixelssum2); 
1288 
temp3 = vec_sra(temp3, vctwo); 
1289  
1290 
pixelssum3 = vec_add(pixelssum4, vcone); 
1291 
pixelssum1 = vec_add(pixelssum2, vcone); 
1292  
1293 
blockv = vec_packsu(temp3, temp4); 
1294 

1295 
vec_st(blockv, 0, block);

1296 

1297 
block += line_size; 
1298 
pixels += line_size; 
1299 
} 
1300 

1301 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

1302 
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 
1303 
} 
1304  
1305 
int has_altivec(void) 
1306 
{ 
1307 
#ifdef CONFIG_DARWIN

1308 
int sels[2] = {CTL_HW, HW_VECTORUNIT}; 
1309 
int has_vu = 0; 
1310 
size_t len = sizeof(has_vu);

1311 
int err;

1312  
1313 
err = sysctl(sels, 2, &has_vu, &len, NULL, 0); 
1314  
1315 
if (err == 0) return (has_vu != 0); 
1316 
#else /* CONFIG_DARWIN */ 
1317 
/* no Darwin, do it the bruteforce way */

1318 
/* this is borrowed from the libmpeg2 library */

1319 
{ 
1320 
signal (SIGILL, sigill_handler); 
1321 
if (sigsetjmp (jmpbuf, 1)) { 
1322 
signal (SIGILL, SIG_DFL); 
1323 
} else {

1324 
canjump = 1;

1325 

1326 
asm volatile ("mtspr 256, %0\n\t" 
1327 
"vand %%v0, %%v0, %%v0"

1328 
: 
1329 
: "r" (1)); 
1330 

1331 
signal (SIGILL, SIG_DFL); 
1332 
return 1; 
1333 
} 
1334 
} 
1335 
#endif /* CONFIG_DARWIN */ 
1336 
return 0; 
1337 
} 