ffmpeg / libavcodec / ppc / dsputil_altivec.c @ aab34ca0
History  View  Annotate  Download (44.6 KB)
1 
/*


2 
* Copyright (c) 2002 Brian Foley

3 
* Copyright (c) 2002 Dieter Shirley

4 
* Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>

5 
*

6 
* This library is free software; you can redistribute it and/or

7 
* modify it under the terms of the GNU Lesser General Public

8 
* License as published by the Free Software Foundation; either

9 
* version 2 of the License, or (at your option) any later version.

10 
*

11 
* This library is distributed in the hope that it will be useful,

12 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

13 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

14 
* Lesser General Public License for more details.

15 
*

16 
* You should have received a copy of the GNU Lesser General Public

17 
* License along with this library; if not, write to the Free Software

18 
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

19 
*/

20 

21 
#include "../dsputil.h" 
22  
23 
#include "gcc_fixes.h" 
24  
25 
#include "dsputil_altivec.h" 
26  
27 
#ifdef CONFIG_DARWIN

28 
#include <sys/sysctl.h> 
29 
#else /* CONFIG_DARWIN */ 
30 
#include <signal.h> 
31 
#include <setjmp.h> 
32  
33 
static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

/* SIGILL handler used to probe for AltiVec support on non-Darwin
   systems: if an AltiVec instruction traps before the probe armed
   `canjump`, re-raise with the default handler (fatal); otherwise
   long-jump back to the probe site to report "no AltiVec". */
static void sigill_handler (int sig)
{
    if (!canjump) {
        /* SIGILL not caused by our probe: restore default and die */
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
46 
#endif /* CONFIG_DARWIN */ 
47  
48 
/**
 * SAD for a 16-wide block against the horizontal half-pel
 * interpolation of the reference (average of pix2[x] and pix2[x+1]).
 * Returns the sum over h rows.
 */
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
94  
95 
/**
 * SAD for a 16-wide block against the vertical half-pel interpolation
 * of the reference (average of the same column in rows y and y+1).
 * Returns the sum over h rows.
 */
int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        /* this row becomes the "upper" row of the next iteration */
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}
154  
155 
/**
 * SAD for a 16-wide block against the 2-D half-pel interpolation of
 * the reference: each predicted pixel is the rounded average of the
 * 2x2 neighbourhood (pix2[x], pix2[x+1], row below same columns).
 * Returns the sum over h rows.
 */
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]  pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them: (a + b + c + d + 2) >> 2 */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
255  
256 
/**
 * Plain (full-pel) SAD for a 16-wide block: sum of |pix1[x]-pix2[x]|
 * over 16 columns and h rows.
 */
int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
297  
298 
/**
 * Plain (full-pel) SAD for an 8-wide block. Loads 16 pixels per row
 * but masks out the upper 8 so they contribute zero to the sum.
 */
int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
342  
343 
/**
 * Sum of squared pixel values over a 16x16 block
 * (the L2 "norm1" used by the encoder's scene analysis).
 */
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
373  
374 
/**
 * Sum of Squared Errors for a 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
429  
430 
/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
480  
481 
/**
 * Sum of all 256 pixel values of a 16x16 block.
 */
int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
513  
514 
/**
 * Widen an 8x8 block of unsigned bytes into a 16-byte-aligned array
 * of 64 DCTELEM shorts (one vec_st of 8 shorts per row).
 */
void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts (zero-extend via merge with 0)
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
539  
540 
/**
 * Store the element-wise difference s1 - s2 of two 8x8 byte blocks
 * into a 16-byte-aligned array of 64 DCTELEM shorts.
 * The loop body is manually unrolled x2 (4 iterations cover 8 rows).
 */
void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++)
    {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
611  
612 
/**
 * dst[i] += src[i] for i in [0, w).
 * The AltiVec path assumes dst and src are 16-byte aligned (guaranteed
 * by the callers); the tail handles w not being a multiple of 16.
 */
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    /* NOTE(review): the original incremented i by 1 while the body
       processes 8 elements; fixed to i += 8. */
    for (i = 0; i + 7 < w; i += 8) {
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for (; i < w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for (i = 0; i + 15 < w; i += 16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16, finish byte-wise.
       NOTE(review): the original tail reused the vector counter as a
       byte index and assigned (dst[i] = src[i]) instead of adding;
       fixed to accumulate at the correct byte offset, matching the
       reference C code. */
    for (; i < w; i++)
    {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
646  
647 
/* next one assumes that ((line_size % 16) == 0) */
/**
 * Copy a 16-wide, h-high block of pixels into `block`.
 * Source may be unaligned; destination rows are 16-byte aligned
 * (line_size is a multiple of 16).
 */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        *((uint32_t*)(block)) = LD32(pixels);
        *((uint32_t*)(block+4)) = LD32(pixels+4);
        *((uint32_t*)(block+8)) = LD32(pixels+8);
        *((uint32_t*)(block+12)) = LD32(pixels+12);
        pixels += line_size;
        block  += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    /* same alignment for every row since line_size % 16 == 0 */
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
    // hand-unrolling the loop by 4 gains about 15%
    // mininum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
#if 0
    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
#else
    for (i = 0; i < h; i += 4) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
#endif
    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
720  
721 
/* next one assumes that ((line_size % 16) == 0) */
/* rounded-up byte-wise average: avg(a,b) = (a|b) - (((a^b) & 0xFEFEFEFE) >> 1) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/**
 * block[i] = avg(block[i], pixels[i]) for a 16-wide, h-high block,
 * with rounding up. Destination rows are 16-byte aligned.
 */
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        op_avg(*((uint32_t*)(block)),LD32(pixels));
        op_avg(*((uint32_t*)(block+4)),LD32(pixels+4));
        op_avg(*((uint32_t*)(block+8)),LD32(pixels+8));
        op_avg(*((uint32_t*)(block+12)),LD32(pixels+12));
        pixels += line_size;
        block  += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        /* vec_avg rounds up, matching op_avg */
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
764  
765 
/* next one assumes that ((line_size % 8) == 0) */
/**
 * block[i] = avg(block[i], pixels[i]) for an 8-wide, h-high block,
 * with rounding up. `block` is only 8-byte aligned, so each vector
 * store merges the averaged half with the untouched half of the
 * 16-byte line it lives in.
 */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        /* avg(a,b) = (a|b) - (((a^b) & 0xFEFEFEFE) >> 1), rounding up */
        uint32_t a, b;
        a = *((uint32_t *) (block));
        b = ((const struct unaligned_32 *) (pixels))->l;
        *((uint32_t *) (block)) =
            (a | b) - (((a ^ b) & 0xFEFEFEFEUL) >> 1);
        a = *((uint32_t *) (block + 4));
        b = ((const struct unaligned_32 *) (pixels + 4))->l;
        *((uint32_t *) (block + 4)) =
            (a | b) - (((a ^ b) & 0xFEFEFEFEUL) >> 1);
        pixels += line_size;
        block += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
           block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not)
        */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        /* keep the untouched 8 bytes of blockv, place the new pixels
           in the half that will actually be averaged */
        if (rightside)
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        }
        else
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
830  
831 
/* next one assumes that ((line_size % 8) == 0) */
/**
 * 2-D half-pel copy for an 8-wide block: each output pixel is the
 * rounded average of the 2x2 source neighbourhood,
 * (a + b + c + d + 2) >> 2. The row sum of one iteration is reused
 * as the top-row sum of the next to halve the unaligned loads.
 */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;
    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    /* two passes of 4 columns each (SIMD-within-a-register on uint32) */
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        /* rewind to the top of the next 4-column group */
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    /* prime pixelssum1 with the top row: pix[x] + pix[x+1] + 2 */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        /* pixels+1 would cross into the next 16-byte line */
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* the lower row's sum (+2 rounding bias) seeds the next iteration */
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* merge the 8 averaged bytes with the untouched half of the
           16-byte destination line */
        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}
945  
/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
948 
{ 
949 
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);

950 
#ifdef ALTIVEC_USE_REFERENCE_C_CODE

951 
int j;

952 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

953 
for (j = 0; j < 2; j++) { 
954 
int i;

955 
const uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
956 
const uint32_t b =

957 
(((const struct unaligned_32 *) (pixels + 1))>l); 
958 
uint32_t l0 = 
959 
(a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; 
960 
uint32_t h0 = 
961 
((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
962 
uint32_t l1, h1; 
963 
pixels += line_size; 
964 
for (i = 0; i < h; i += 2) { 
965 
uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
966 
uint32_t b = (((const struct unaligned_32 *) (pixels + 1))>l); 
967 
l1 = (a & 0x03030303UL) + (b & 0x03030303UL); 
968 
h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
969 
*((uint32_t *) block) = 
970 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
971 
pixels += line_size; 
972 
block += line_size; 
973 
a = (((const struct unaligned_32 *) (pixels))>l); 
974 
b = (((const struct unaligned_32 *) (pixels + 1))>l); 
975 
l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; 
976 
h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
977 
*((uint32_t *) block) = 
978 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
979 
pixels += line_size; 
980 
block += line_size; 
981 
} pixels += 4  line_size * (h + 1); 
982 
block += 4  line_size * h;

983 
} 
984 

985 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

986  
987 
#else /* ALTIVEC_USE_REFERENCE_C_CODE */ 
988 
register int i; 
989 
register vector unsigned char 
990 
pixelsv1, pixelsv2, 
991 
pixelsavg; 
992 
register vector unsigned char 
993 
blockv, temp1, temp2; 
994 
register vector unsigned short 
995 
pixelssum1, pixelssum2, temp3; 
996 
register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); 
997 
register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1); 
998 
register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); 
999 

1000 
temp1 = vec_ld(0, pixels);

1001 
temp2 = vec_ld(16, pixels);

1002 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

1003 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
1004 
{ 
1005 
pixelsv2 = temp2; 
1006 
} 
1007 
else

1008 
{ 
1009 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

1010 
} 
1011 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1012 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1013 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1014 
(vector unsigned short)pixelsv2); 
1015 
pixelssum1 = vec_add(pixelssum1, vcone); 
1016 

1017 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

1018 
for (i = 0; i < h ; i++) { 
1019 
int rightside = ((unsigned long)block & 0x0000000F); 
1020 
blockv = vec_ld(0, block);

1021  
1022 
temp1 = vec_ld(line_size, pixels); 
1023 
temp2 = vec_ld(line_size + 16, pixels);

1024 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1025 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
1026 
{ 
1027 
pixelsv2 = temp2; 
1028 
} 
1029 
else

1030 
{ 
1031 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1032 
} 
1033  
1034 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1035 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1036 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1037 
(vector unsigned short)pixelsv2); 
1038 
temp3 = vec_add(pixelssum1, pixelssum2); 
1039 
temp3 = vec_sra(temp3, vctwo); 
1040 
pixelssum1 = vec_add(pixelssum2, vcone); 
1041 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
1042 

1043 
if (rightside)

1044 
{ 
1045 
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
1046 
} 
1047 
else

1048 
{ 
1049 
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
1050 
} 
1051 

1052 
vec_st(blockv, 0, block);

1053 

1054 
block += line_size; 
1055 
pixels += line_size; 
1056 
} 
1057 

1058 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

1059 
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 
1060 
} 
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
1064 
{ 
1065 
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);

1066 
#ifdef ALTIVEC_USE_REFERENCE_C_CODE

1067 
int j;

1068 
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

1069 
for (j = 0; j < 4; j++) { 
1070 
int i;

1071 
const uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
1072 
const uint32_t b =

1073 
(((const struct unaligned_32 *) (pixels + 1))>l); 
1074 
uint32_t l0 = 
1075 
(a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; 
1076 
uint32_t h0 = 
1077 
((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1078 
uint32_t l1, h1; 
1079 
pixels += line_size; 
1080 
for (i = 0; i < h; i += 2) { 
1081 
uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
1082 
uint32_t b = (((const struct unaligned_32 *) (pixels + 1))>l); 
1083 
l1 = (a & 0x03030303UL) + (b & 0x03030303UL); 
1084 
h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1085 
*((uint32_t *) block) = 
1086 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
1087 
pixels += line_size; 
1088 
block += line_size; 
1089 
a = (((const struct unaligned_32 *) (pixels))>l); 
1090 
b = (((const struct unaligned_32 *) (pixels + 1))>l); 
1091 
l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; 
1092 
h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1093 
*((uint32_t *) block) = 
1094 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
1095 
pixels += line_size; 
1096 
block += line_size; 
1097 
} pixels += 4  line_size * (h + 1); 
1098 
block += 4  line_size * h;

1099 
} 
1100  
1101 
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

1102  
1103 
#else /* ALTIVEC_USE_REFERENCE_C_CODE */ 
1104 
register int i; 
1105 
register vector unsigned char 
1106 
pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
1107 
register vector unsigned char 
1108 
blockv, temp1, temp2; 
1109 
register vector unsigned short 
1110 
pixelssum1, pixelssum2, temp3, 
1111 
pixelssum3, pixelssum4, temp4; 
1112 
register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); 
1113 
register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); 
1114  
1115 
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

1116 

1117 
temp1 = vec_ld(0, pixels);

1118 
temp2 = vec_ld(16, pixels);

1119 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

1120 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
1121 
{ 
1122 
pixelsv2 = temp2; 
1123 
} 
1124 
else

1125 
{ 
1126 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

1127 
} 
1128 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
1129 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1130 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1131 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1132 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
1133 
(vector unsigned short)pixelsv4); 
1134 
pixelssum3 = vec_add(pixelssum3, vctwo); 
1135 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1136 
(vector unsigned short)pixelsv2); 
1137 
pixelssum1 = vec_add(pixelssum1, vctwo); 
1138 

1139 
for (i = 0; i < h ; i++) { 
1140 
blockv = vec_ld(0, block);

1141  
1142 
temp1 = vec_ld(line_size, pixels); 
1143 
temp2 = vec_ld(line_size + 16, pixels);

1144 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1145 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
1146 
{ 
1147 
pixelsv2 = temp2; 
1148 
} 
1149 
else

1150 
{ 
1151 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1152 
} 
1153  
1154 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
1155 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1156 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1157 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1158 

1159 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
1160 
(vector unsigned short)pixelsv4); 
1161 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1162 
(vector unsigned short)pixelsv2); 
1163 
temp4 = vec_add(pixelssum3, pixelssum4); 
1164 
temp4 = vec_sra(temp4, vctwo); 
1165 
temp3 = vec_add(pixelssum1, pixelssum2); 
1166 
temp3 = vec_sra(temp3, vctwo); 
1167  
1168 
pixelssum3 = vec_add(pixelssum4, vctwo); 
1169 
pixelssum1 = vec_add(pixelssum2, vctwo); 
1170  
1171 
blockv = vec_packsu(temp3, temp4); 
1172 

1173 
vec_st(blockv, 0, block);

1174 

1175 
block += line_size; 
1176 
pixels += line_size; 
1177 
} 
1178 

1179 
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

1180 
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 
1181 
} 
/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
1185 
{ 
1186 
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);

1187 
#ifdef ALTIVEC_USE_REFERENCE_C_CODE

1188 
int j;

1189 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

1190 
for (j = 0; j < 4; j++) { 
1191 
int i;

1192 
const uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
1193 
const uint32_t b =

1194 
(((const struct unaligned_32 *) (pixels + 1))>l); 
1195 
uint32_t l0 = 
1196 
(a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; 
1197 
uint32_t h0 = 
1198 
((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1199 
uint32_t l1, h1; 
1200 
pixels += line_size; 
1201 
for (i = 0; i < h; i += 2) { 
1202 
uint32_t a = (((const struct unaligned_32 *) (pixels))>l); 
1203 
uint32_t b = (((const struct unaligned_32 *) (pixels + 1))>l); 
1204 
l1 = (a & 0x03030303UL) + (b & 0x03030303UL); 
1205 
h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1206 
*((uint32_t *) block) = 
1207 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
1208 
pixels += line_size; 
1209 
block += line_size; 
1210 
a = (((const struct unaligned_32 *) (pixels))>l); 
1211 
b = (((const struct unaligned_32 *) (pixels + 1))>l); 
1212 
l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; 
1213 
h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); 
1214 
*((uint32_t *) block) = 
1215 
h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); 
1216 
pixels += line_size; 
1217 
block += line_size; 
1218 
} pixels += 4  line_size * (h + 1); 
1219 
block += 4  line_size * h;

1220 
} 
1221  
1222 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

1223  
1224 
#else /* ALTIVEC_USE_REFERENCE_C_CODE */ 
1225 
register int i; 
1226 
register vector unsigned char 
1227 
pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
1228 
register vector unsigned char 
1229 
blockv, temp1, temp2; 
1230 
register vector unsigned short 
1231 
pixelssum1, pixelssum2, temp3, 
1232 
pixelssum3, pixelssum4, temp4; 
1233 
register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); 
1234 
register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1); 
1235 
register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); 
1236  
1237 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

1238 

1239 
temp1 = vec_ld(0, pixels);

1240 
temp2 = vec_ld(16, pixels);

1241 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

1242 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
1243 
{ 
1244 
pixelsv2 = temp2; 
1245 
} 
1246 
else

1247 
{ 
1248 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

1249 
} 
1250 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
1251 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1252 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1253 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1254 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
1255 
(vector unsigned short)pixelsv4); 
1256 
pixelssum3 = vec_add(pixelssum3, vcone); 
1257 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1258 
(vector unsigned short)pixelsv2); 
1259 
pixelssum1 = vec_add(pixelssum1, vcone); 
1260 

1261 
for (i = 0; i < h ; i++) { 
1262 
blockv = vec_ld(0, block);

1263  
1264 
temp1 = vec_ld(line_size, pixels); 
1265 
temp2 = vec_ld(line_size + 16, pixels);

1266 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1267 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
1268 
{ 
1269 
pixelsv2 = temp2; 
1270 
} 
1271 
else

1272 
{ 
1273 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1274 
} 
1275  
1276 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
1277 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1278 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1279 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1280 

1281 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
1282 
(vector unsigned short)pixelsv4); 
1283 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1284 
(vector unsigned short)pixelsv2); 
1285 
temp4 = vec_add(pixelssum3, pixelssum4); 
1286 
temp4 = vec_sra(temp4, vctwo); 
1287 
temp3 = vec_add(pixelssum1, pixelssum2); 
1288 
temp3 = vec_sra(temp3, vctwo); 
1289  
1290 
pixelssum3 = vec_add(pixelssum4, vcone); 
1291 
pixelssum1 = vec_add(pixelssum2, vcone); 
1292  
1293 
blockv = vec_packsu(temp3, temp4); 
1294 

1295 
vec_st(blockv, 0, block);

1296 

1297 
block += line_size; 
1298 
pixels += line_size; 
1299 
} 
1300 

1301 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

1302 
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ 
1303 
} 
1304  
1305 
int has_altivec(void) 
1306 
{ 
1307 
#ifdef CONFIG_DARWIN

1308 
int sels[2] = {CTL_HW, HW_VECTORUNIT}; 
1309 
int has_vu = 0; 
1310 
size_t len = sizeof(has_vu);

1311 
int err;

1312  
1313 
err = sysctl(sels, 2, &has_vu, &len, NULL, 0); 
1314  
1315 
if (err == 0) return (has_vu != 0); 
1316 
#else /* CONFIG_DARWIN */ 
1317 
/* no Darwin, do it the bruteforce way */

1318 
/* this is borrowed from the libmpeg2 library */

1319 
{ 
1320 
signal (SIGILL, sigill_handler); 
1321 
if (sigsetjmp (jmpbuf, 1)) { 
1322 
signal (SIGILL, SIG_DFL); 
1323 
} else {

1324 
canjump = 1;

1325 

1326 
asm volatile ("mtspr 256, %0\n\t" 
1327 
"vand %%v0, %%v0, %%v0"

1328 
: 
1329 
: "r" (1)); 
1330 

1331 
signal (SIGILL, SIG_DFL); 
1332 
return 1; 
1333 
} 
1334 
} 
1335 
#endif /* CONFIG_DARWIN */ 
1336 
return 0; 
1337 
} 