ffmpeg / libavcodec / ppc / dsputil_altivec.c @ 0c893224
History  View  Annotate  Download (56.6 KB)
1 
/*


2 
* Copyright (c) 2002 Brian Foley

3 
* Copyright (c) 2002 Dieter Shirley

4 
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>

5 
*

6 
* This file is part of FFmpeg.

7 
*

8 
* FFmpeg is free software; you can redistribute it and/or

9 
* modify it under the terms of the GNU Lesser General Public

10 
* License as published by the Free Software Foundation; either

11 
* version 2.1 of the License, or (at your option) any later version.

12 
*

13 
* FFmpeg is distributed in the hope that it will be useful,

14 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

15 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

16 
* Lesser General Public License for more details.

17 
*

18 
* You should have received a copy of the GNU Lesser General Public

19 
* License along with FFmpeg; if not, write to the Free Software

20 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

21 
*/

22  
23 
#include "dsputil.h" 
24  
25 
#include "gcc_fixes.h" 
26  
27 
#include "dsputil_ppc.h" 
28 
#include "util_altivec.h" 
29  
30 
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
31 
{ 
32 
int i;

33 
DECLARE_ALIGNED_16(int, s);

34 
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
35 
vector unsigned char *tv; 
36 
vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; 
37 
vector unsigned int sad; 
38 
vector signed int sumdiffs; 
39  
40 
s = 0;

41 
sad = (vector unsigned int)vec_splat_u32(0); 
42 
for(i=0;i<h;i++) { 
43 
/*

44 
Read unaligned pixels into our vectors. The vectors are as follows:

45 
pix1v: pix1[0]pix1[15]

46 
pix2v: pix2[0]pix2[15] pix2iv: pix2[1]pix2[16]

47 
*/

48 
tv = (vector unsigned char *) pix1; 
49 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
50  
51 
tv = (vector unsigned char *) &pix2[0]; 
52 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
53  
54 
tv = (vector unsigned char *) &pix2[1]; 
55 
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); 
56  
57 
/* Calculate the average vector */

58 
avgv = vec_avg(pix2v, pix2iv); 
59  
60 
/* Calculate a sum of abs differences vector */

61 
t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
62  
63 
/* Add each 4 pixel group together and put 4 results into sad */

64 
sad = vec_sum4s(t5, sad); 
65  
66 
pix1 += line_size; 
67 
pix2 += line_size; 
68 
} 
69 
/* Sum up the four partial sums, and put the result into s */

70 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
71 
sumdiffs = vec_splat(sumdiffs, 3);

72 
vec_ste(sumdiffs, 0, &s);

73  
74 
return s;

75 
} 
76  
77 
int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
78 
{ 
79 
int i;

80 
DECLARE_ALIGNED_16(int, s);

81 
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
82 
vector unsigned char *tv; 
83 
vector unsigned char pix1v, pix2v, pix3v, avgv, t5; 
84 
vector unsigned int sad; 
85 
vector signed int sumdiffs; 
86 
uint8_t *pix3 = pix2 + line_size; 
87  
88 
s = 0;

89 
sad = (vector unsigned int)vec_splat_u32(0); 
90  
91 
/*

92 
Due to the fact that pix3 = pix2 + line_size, the pix3 of one

93 
iteration becomes pix2 in the next iteration. We can use this

94 
fact to avoid a potentially expensive unaligned read, each

95 
time around the loop.

96 
Read unaligned pixels into our vectors. The vectors are as follows:

97 
pix2v: pix2[0]pix2[15]

98 
Split the pixel vectors into shorts

99 
*/

100 
tv = (vector unsigned char *) &pix2[0]; 
101 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
102  
103 
for(i=0;i<h;i++) { 
104 
/*

105 
Read unaligned pixels into our vectors. The vectors are as follows:

106 
pix1v: pix1[0]pix1[15]

107 
pix3v: pix3[0]pix3[15]

108 
*/

109 
tv = (vector unsigned char *) pix1; 
110 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
111  
112 
tv = (vector unsigned char *) &pix3[0]; 
113 
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); 
114  
115 
/* Calculate the average vector */

116 
avgv = vec_avg(pix2v, pix3v); 
117  
118 
/* Calculate a sum of abs differences vector */

119 
t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
120  
121 
/* Add each 4 pixel group together and put 4 results into sad */

122 
sad = vec_sum4s(t5, sad); 
123  
124 
pix1 += line_size; 
125 
pix2v = pix3v; 
126 
pix3 += line_size; 
127  
128 
} 
129  
130 
/* Sum up the four partial sums, and put the result into s */

131 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
132 
sumdiffs = vec_splat(sumdiffs, 3);

133 
vec_ste(sumdiffs, 0, &s);

134 
return s;

135 
} 
136  
137 
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
138 
{ 
139 
int i;

140 
DECLARE_ALIGNED_16(int, s);

141 
uint8_t *pix3 = pix2 + line_size; 
142 
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
143 
const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); 
144 
vector unsigned char *tv, avgv, t5; 
145 
vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; 
146 
vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; 
147 
vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; 
148 
vector unsigned short avghv, avglv; 
149 
vector unsigned short t1, t2, t3, t4; 
150 
vector unsigned int sad; 
151 
vector signed int sumdiffs; 
152  
153 
sad = (vector unsigned int)vec_splat_u32(0); 
154  
155 
s = 0;

156  
157 
/*

158 
Due to the fact that pix3 = pix2 + line_size, the pix3 of one

159 
iteration becomes pix2 in the next iteration. We can use this

160 
fact to avoid a potentially expensive unaligned read, as well

161 
as some splitting, and vector addition each time around the loop.

162 
Read unaligned pixels into our vectors. The vectors are as follows:

163 
pix2v: pix2[0]pix2[15] pix2iv: pix2[1]pix2[16]

164 
Split the pixel vectors into shorts

165 
*/

166 
tv = (vector unsigned char *) &pix2[0]; 
167 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
168  
169 
tv = (vector unsigned char *) &pix2[1]; 
170 
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); 
171  
172 
pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); 
173 
pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); 
174 
pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); 
175 
pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); 
176 
t1 = vec_add(pix2hv, pix2ihv); 
177 
t2 = vec_add(pix2lv, pix2ilv); 
178  
179 
for(i=0;i<h;i++) { 
180 
/*

181 
Read unaligned pixels into our vectors. The vectors are as follows:

182 
pix1v: pix1[0]pix1[15]

183 
pix3v: pix3[0]pix3[15] pix3iv: pix3[1]pix3[16]

184 
*/

185 
tv = (vector unsigned char *) pix1; 
186 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
187  
188 
tv = (vector unsigned char *) &pix3[0]; 
189 
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); 
190  
191 
tv = (vector unsigned char *) &pix3[1]; 
192 
pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); 
193  
194 
/*

195 
Note that AltiVec does have vec_avg, but this works on vector pairs

196 
and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding

197 
would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.

198 
Instead, we have to split the pixel vectors into vectors of shorts,

199 
and do the averaging by hand.

200 
*/

201  
202 
/* Split the pixel vectors into shorts */

203 
pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); 
204 
pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); 
205 
pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); 
206 
pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); 
207  
208 
/* Do the averaging on them */

209 
t3 = vec_add(pix3hv, pix3ihv); 
210 
t4 = vec_add(pix3lv, pix3ilv); 
211  
212 
avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); 
213 
avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); 
214  
215 
/* Pack the shorts back into a result */

216 
avgv = vec_pack(avghv, avglv); 
217  
218 
/* Calculate a sum of abs differences vector */

219 
t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
220  
221 
/* Add each 4 pixel group together and put 4 results into sad */

222 
sad = vec_sum4s(t5, sad); 
223  
224 
pix1 += line_size; 
225 
pix3 += line_size; 
226 
/* Transfer the calculated values for pix3 into pix2 */

227 
t1 = t3; 
228 
t2 = t4; 
229 
} 
230 
/* Sum up the four partial sums, and put the result into s */

231 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
232 
sumdiffs = vec_splat(sumdiffs, 3);

233 
vec_ste(sumdiffs, 0, &s);

234  
235 
return s;

236 
} 
237  
238 
int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
239 
{ 
240 
int i;

241 
DECLARE_ALIGNED_16(int, s);

242 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
243 
vector unsigned char perm1, perm2, *pix1v, *pix2v; 
244 
vector unsigned char t1, t2, t3,t4, t5; 
245 
vector unsigned int sad; 
246 
vector signed int sumdiffs; 
247  
248 
sad = (vector unsigned int)vec_splat_u32(0); 
249  
250  
251 
for(i=0;i<h;i++) { 
252 
/* Read potentially unaligned pixels into t1 and t2 */

253 
perm1 = vec_lvsl(0, pix1);

254 
pix1v = (vector unsigned char *) pix1; 
255 
perm2 = vec_lvsl(0, pix2);

256 
pix2v = (vector unsigned char *) pix2; 
257 
t1 = vec_perm(pix1v[0], pix1v[1], perm1); 
258 
t2 = vec_perm(pix2v[0], pix2v[1], perm2); 
259  
260 
/* Calculate a sum of abs differences vector */

261 
t3 = vec_max(t1, t2); 
262 
t4 = vec_min(t1, t2); 
263 
t5 = vec_sub(t3, t4); 
264  
265 
/* Add each 4 pixel group together and put 4 results into sad */

266 
sad = vec_sum4s(t5, sad); 
267  
268 
pix1 += line_size; 
269 
pix2 += line_size; 
270 
} 
271  
272 
/* Sum up the four partial sums, and put the result into s */

273 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
274 
sumdiffs = vec_splat(sumdiffs, 3);

275 
vec_ste(sumdiffs, 0, &s);

276  
277 
return s;

278 
} 
279  
280 
int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
281 
{ 
282 
int i;

283 
DECLARE_ALIGNED_16(int, s);

284 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
285 
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 
286 
vector unsigned char t1, t2, t3,t4, t5; 
287 
vector unsigned int sad; 
288 
vector signed int sumdiffs; 
289  
290 
sad = (vector unsigned int)vec_splat_u32(0); 
291  
292 
permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); 
293  
294 
for(i=0;i<h;i++) { 
295 
/* Read potentially unaligned pixels into t1 and t2

296 
Since we're reading 16 pixels, and actually only want 8,

297 
mask out the last 8 pixels. The 0s don't change the sum. */

298 
perm1 = vec_lvsl(0, pix1);

299 
pix1v = (vector unsigned char *) pix1; 
300 
perm2 = vec_lvsl(0, pix2);

301 
pix2v = (vector unsigned char *) pix2; 
302 
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); 
303 
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); 
304  
305 
/* Calculate a sum of abs differences vector */

306 
t3 = vec_max(t1, t2); 
307 
t4 = vec_min(t1, t2); 
308 
t5 = vec_sub(t3, t4); 
309  
310 
/* Add each 4 pixel group together and put 4 results into sad */

311 
sad = vec_sum4s(t5, sad); 
312  
313 
pix1 += line_size; 
314 
pix2 += line_size; 
315 
} 
316  
317 
/* Sum up the four partial sums, and put the result into s */

318 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
319 
sumdiffs = vec_splat(sumdiffs, 3);

320 
vec_ste(sumdiffs, 0, &s);

321  
322 
return s;

323 
} 
324  
325 
int pix_norm1_altivec(uint8_t *pix, int line_size) 
326 
{ 
327 
int i;

328 
DECLARE_ALIGNED_16(int, s);

329 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
330 
vector unsigned char *tv; 
331 
vector unsigned char pixv; 
332 
vector unsigned int sv; 
333 
vector signed int sum; 
334  
335 
sv = (vector unsigned int)vec_splat_u32(0); 
336  
337 
s = 0;

338 
for (i = 0; i < 16; i++) { 
339 
/* Read in the potentially unaligned pixels */

340 
tv = (vector unsigned char *) pix; 
341 
pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); 
342  
343 
/* Square the values, and add them to our sum */

344 
sv = vec_msum(pixv, pixv, sv); 
345  
346 
pix += line_size; 
347 
} 
348 
/* Sum up the four partial sums, and put the result into s */

349 
sum = vec_sums((vector signed int) sv, (vector signed int) zero); 
350 
sum = vec_splat(sum, 3);

351 
vec_ste(sum, 0, &s);

352  
353 
return s;

354 
} 
355  
356 
/**

357 
* Sum of Squared Errors for a 8x8 block.

358 
* AltiVecenhanced.

359 
* It's the sad8_altivec code above w/ squaring added.

360 
*/

361 
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
362 
{ 
363 
int i;

364 
DECLARE_ALIGNED_16(int, s);

365 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
366 
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 
367 
vector unsigned char t1, t2, t3,t4, t5; 
368 
vector unsigned int sum; 
369 
vector signed int sumsqr; 
370  
371 
sum = (vector unsigned int)vec_splat_u32(0); 
372  
373 
permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); 
374  
375  
376 
for(i=0;i<h;i++) { 
377 
/* Read potentially unaligned pixels into t1 and t2

378 
Since we're reading 16 pixels, and actually only want 8,

379 
mask out the last 8 pixels. The 0s don't change the sum. */

380 
perm1 = vec_lvsl(0, pix1);

381 
pix1v = (vector unsigned char *) pix1; 
382 
perm2 = vec_lvsl(0, pix2);

383 
pix2v = (vector unsigned char *) pix2; 
384 
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); 
385 
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); 
386  
387 
/*

388 
Since we want to use unsigned chars, we can take advantage

389 
of the fact that abs(ab)^2 = (ab)^2.

390 
*/

391  
392 
/* Calculate abs differences vector */

393 
t3 = vec_max(t1, t2); 
394 
t4 = vec_min(t1, t2); 
395 
t5 = vec_sub(t3, t4); 
396  
397 
/* Square the values and add them to our sum */

398 
sum = vec_msum(t5, t5, sum); 
399  
400 
pix1 += line_size; 
401 
pix2 += line_size; 
402 
} 
403  
404 
/* Sum up the four partial sums, and put the result into s */

405 
sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); 
406 
sumsqr = vec_splat(sumsqr, 3);

407 
vec_ste(sumsqr, 0, &s);

408  
409 
return s;

410 
} 
411  
412 
/**

413 
* Sum of Squared Errors for a 16x16 block.

414 
* AltiVecenhanced.

415 
* It's the sad16_altivec code above w/ squaring added.

416 
*/

417 
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
418 
{ 
419 
int i;

420 
DECLARE_ALIGNED_16(int, s);

421 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
422 
vector unsigned char perm1, perm2, *pix1v, *pix2v; 
423 
vector unsigned char t1, t2, t3,t4, t5; 
424 
vector unsigned int sum; 
425 
vector signed int sumsqr; 
426  
427 
sum = (vector unsigned int)vec_splat_u32(0); 
428  
429 
for(i=0;i<h;i++) { 
430 
/* Read potentially unaligned pixels into t1 and t2 */

431 
perm1 = vec_lvsl(0, pix1);

432 
pix1v = (vector unsigned char *) pix1; 
433 
perm2 = vec_lvsl(0, pix2);

434 
pix2v = (vector unsigned char *) pix2; 
435 
t1 = vec_perm(pix1v[0], pix1v[1], perm1); 
436 
t2 = vec_perm(pix2v[0], pix2v[1], perm2); 
437  
438 
/*

439 
Since we want to use unsigned chars, we can take advantage

440 
of the fact that abs(ab)^2 = (ab)^2.

441 
*/

442  
443 
/* Calculate abs differences vector */

444 
t3 = vec_max(t1, t2); 
445 
t4 = vec_min(t1, t2); 
446 
t5 = vec_sub(t3, t4); 
447  
448 
/* Square the values and add them to our sum */

449 
sum = vec_msum(t5, t5, sum); 
450  
451 
pix1 += line_size; 
452 
pix2 += line_size; 
453 
} 
454  
455 
/* Sum up the four partial sums, and put the result into s */

456 
sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); 
457 
sumsqr = vec_splat(sumsqr, 3);

458 
vec_ste(sumsqr, 0, &s);

459  
460 
return s;

461 
} 
462  
463 
int pix_sum_altivec(uint8_t * pix, int line_size) 
464 
{ 
465 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
466 
vector unsigned char perm, *pixv; 
467 
vector unsigned char t1; 
468 
vector unsigned int sad; 
469 
vector signed int sumdiffs; 
470  
471 
int i;

472 
DECLARE_ALIGNED_16(int, s);

473  
474 
sad = (vector unsigned int)vec_splat_u32(0); 
475  
476 
for (i = 0; i < 16; i++) { 
477 
/* Read the potentially unaligned 16 pixels into t1 */

478 
perm = vec_lvsl(0, pix);

479 
pixv = (vector unsigned char *) pix; 
480 
t1 = vec_perm(pixv[0], pixv[1], perm); 
481  
482 
/* Add each 4 pixel group together and put 4 results into sad */

483 
sad = vec_sum4s(t1, sad); 
484  
485 
pix += line_size; 
486 
} 
487  
488 
/* Sum up the four partial sums, and put the result into s */

489 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
490 
sumdiffs = vec_splat(sumdiffs, 3);

491 
vec_ste(sumdiffs, 0, &s);

492  
493 
return s;

494 
} 
495  
496 
/**
 * Read 8 rows of 8 pixels and store them, widened to 16 bits, as 8
 * consecutive 16-byte vectors in block.
 *
 * @param block     destination; written with vec_st, so it is assumed to be
 *                  16-byte aligned
 * @param pixels    first source pixel, any alignment
 * @param line_size byte offset from one source row to the next
 */
void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    int row;

    for (row = 0; row < 8; row++) {
        // Read potentially unaligned pixels. 16 pixels are fetched although
        // only the first 8 are wanted; the extras are simply ignored.
        vector unsigned char raw =
            vec_perm(vec_ld(0, pixels), vec_ld(16, pixels), vec_lvsl(0, pixels));

        // Interleave zero bytes with the first 8 pixels: bytes -> shorts
        vector signed short widened = (vector signed short)vec_mergeh(zero, raw);

        // Row number `row` goes to byte offset row*16 of the block
        vec_st(widened, row * 16, (vector signed short*)block);

        pixels += line_size;
    }
}
521  
522 
/**
 * Compute the 16-bit difference s1 - s2 of two blocks of 8 rows by 8 pixels
 * and store the rows as 8 consecutive 16-byte vectors in block.
 *
 * @param block  destination; written with vec_st, so it is assumed to be
 *               16-byte aligned
 * @param s1     first pixel of the minuend block, any alignment
 * @param s2     first pixel of the subtrahend block, any alignment
 * @param stride byte offset from one row to the next, for both sources
 */
void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    int row;

    for (row = 0; row < 8; row++) {
        // Read potentially unaligned pixels from both sources. 16 pixels
        // are fetched although only the first 8 are wanted; the extras are
        // simply ignored.
        vector unsigned char raw1 = vec_perm(vec_ld(0, s1), vec_ld(16, s1), vec_lvsl(0, s1));
        vector unsigned char raw2 = vec_perm(vec_ld(0, s2), vec_ld(16, s2), vec_lvsl(0, s2));

        // Interleave zero bytes with the first 8 pixels: bytes -> shorts
        vector signed short wide1 = (vector signed short)vec_mergeh(zero, raw1);
        vector signed short wide2 = (vector signed short)vec_mergeh(zero, raw2);

        // Subtract and save one row (8 coefficients) of the block
        vec_st(vec_sub(wide1, wide2), 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
593  
594 
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { 
595 
register int i; 
596 
register vector unsigned char vdst, vsrc; 
597  
598 
/* dst and src are 16 bytesaligned (guaranteed) */

599 
for(i = 0 ; (i + 15) < w ; i+=16) 
600 
{ 
601 
vdst = vec_ld(i, (unsigned char*)dst); 
602 
vsrc = vec_ld(i, (unsigned char*)src); 
603 
vdst = vec_add(vsrc, vdst); 
604 
vec_st(vdst, i, (unsigned char*)dst); 
605 
} 
606 
/* if w is not a multiple of 16 */

607 
for (; (i < w) ; i++)

608 
{ 
609 
dst[i] = src[i]; 
610 
} 
611 
} 
612  
613 
/* next one assumes that ((line_size % 16) == 0) */

614 
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
615 
{ 
616 
POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);

617 
register vector unsigned char pixelsv1, pixelsv2; 
618 
register vector unsigned char pixelsv1B, pixelsv2B; 
619 
register vector unsigned char pixelsv1C, pixelsv2C; 
620 
register vector unsigned char pixelsv1D, pixelsv2D; 
621  
622 
register vector unsigned char perm = vec_lvsl(0, pixels); 
623 
int i;

624 
register int line_size_2 = line_size << 1; 
625 
register int line_size_3 = line_size + line_size_2; 
626 
register int line_size_4 = line_size << 2; 
627  
628 
POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

629 
// handunrolling the loop by 4 gains about 15%

630 
// mininum execution time goes from 74 to 60 cycles

631 
// it's faster than funrollloops, but using

632 
// funrollloops w/ this is bad  74 cycles again.

633 
// all this is on a 7450, tuning for the 7450

634 
#if 0

635 
for(i=0; i<h; i++) {

636 
pixelsv1 = vec_ld(0, (unsigned char*)pixels);

637 
pixelsv2 = vec_ld(16, (unsigned char*)pixels);

638 
vec_st(vec_perm(pixelsv1, pixelsv2, perm),

639 
0, (unsigned char*)block);

640 
pixels+=line_size;

641 
block +=line_size;

642 
}

643 
#else

644 
for(i=0; i<h; i+=4) { 
645 
pixelsv1 = vec_ld(0, (unsigned char*)pixels); 
646 
pixelsv2 = vec_ld(15, (unsigned char*)pixels); 
647 
pixelsv1B = vec_ld(line_size, (unsigned char*)pixels); 
648 
pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels); 
649 
pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels); 
650 
pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels); 
651 
pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels); 
652 
pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels); 
653 
vec_st(vec_perm(pixelsv1, pixelsv2, perm), 
654 
0, (unsigned char*)block); 
655 
vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), 
656 
line_size, (unsigned char*)block); 
657 
vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), 
658 
line_size_2, (unsigned char*)block); 
659 
vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), 
660 
line_size_3, (unsigned char*)block); 
661 
pixels+=line_size_4; 
662 
block +=line_size_4; 
663 
} 
664 
#endif

665 
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

666 
} 
667  
668 
/* next one assumes that ((line_size % 16) == 0) */

669 
/* Per-byte average, rounded up, of the packed bytes of two 32-bit words,
   stored back into a: (a|b) - (((a^b) & 0xFEFEFEFE) >> 1). The mask keeps
   the shift from leaking a bit from one byte into its neighbour. */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
670 
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
671 
{ 
672 
POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);

673 
register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 
674 
register vector unsigned char perm = vec_lvsl(0, pixels); 
675 
int i;

676  
677 
POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

678  
679 
for(i=0; i<h; i++) { 
680 
pixelsv1 = vec_ld(0, (unsigned char*)pixels); 
681 
pixelsv2 = vec_ld(16, (unsigned char*)pixels); 
682 
blockv = vec_ld(0, block);

683 
pixelsv = vec_perm(pixelsv1, pixelsv2, perm); 
684 
blockv = vec_avg(blockv,pixelsv); 
685 
vec_st(blockv, 0, (unsigned char*)block); 
686 
pixels+=line_size; 
687 
block +=line_size; 
688 
} 
689  
690 
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

691 
} 
692  
693 
/* next one assumes that ((line_size % 8) == 0) */

694 
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
695 
{ 
696 
POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);

697 
register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 
698 
int i;

699  
700 
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

701  
702 
for (i = 0; i < h; i++) { 
703 
/*

704 
block is 8 bytesaligned, so we're either in the

705 
left block (16 bytesaligned) or in the right block (not)

706 
*/

707 
int rightside = ((unsigned long)block & 0x0000000F); 
708  
709 
blockv = vec_ld(0, block);

710 
pixelsv1 = vec_ld(0, (unsigned char*)pixels); 
711 
pixelsv2 = vec_ld(16, (unsigned char*)pixels); 
712 
pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

713  
714 
if (rightside)

715 
{ 
716 
pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); 
717 
} 
718 
else

719 
{ 
720 
pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); 
721 
} 
722  
723 
blockv = vec_avg(blockv, pixelsv); 
724  
725 
vec_st(blockv, 0, block);

726  
727 
pixels += line_size; 
728 
block += line_size; 
729 
} 
730  
731 
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

732 
} 
733  
734 
/* next one assumes that ((line_size % 8) == 0) */

735 
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
736 
{ 
737 
POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);

738 
register int i; 
739 
register vector unsigned char 
740 
pixelsv1, pixelsv2, 
741 
pixelsavg; 
742 
register vector unsigned char 
743 
blockv, temp1, temp2; 
744 
register vector unsigned short 
745 
pixelssum1, pixelssum2, temp3; 
746 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
747 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
748  
749 
temp1 = vec_ld(0, pixels);

750 
temp2 = vec_ld(16, pixels);

751 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

752 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
753 
{ 
754 
pixelsv2 = temp2; 
755 
} 
756 
else

757 
{ 
758 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

759 
} 
760 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
761 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
762 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
763 
(vector unsigned short)pixelsv2); 
764 
pixelssum1 = vec_add(pixelssum1, vctwo); 
765  
766 
POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);

767 
for (i = 0; i < h ; i++) { 
768 
int rightside = ((unsigned long)block & 0x0000000F); 
769 
blockv = vec_ld(0, block);

770  
771 
temp1 = vec_ld(line_size, pixels); 
772 
temp2 = vec_ld(line_size + 16, pixels);

773 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
774 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
775 
{ 
776 
pixelsv2 = temp2; 
777 
} 
778 
else

779 
{ 
780 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

781 
} 
782  
783 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
784 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
785 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
786 
(vector unsigned short)pixelsv2); 
787 
temp3 = vec_add(pixelssum1, pixelssum2); 
788 
temp3 = vec_sra(temp3, vctwo); 
789 
pixelssum1 = vec_add(pixelssum2, vctwo); 
790 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
791  
792 
if (rightside)

793 
{ 
794 
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
795 
} 
796 
else

797 
{ 
798 
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
799 
} 
800  
801 
vec_st(blockv, 0, block);

802  
803 
block += line_size; 
804 
pixels += line_size; 
805 
} 
806  
807 
POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

808 
} 
809  
810 
/* next one assumes that ((line_size % 8) == 0) */

811 
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
812 
{ 
813 
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);

814 
register int i; 
815 
register vector unsigned char 
816 
pixelsv1, pixelsv2, 
817 
pixelsavg; 
818 
register vector unsigned char 
819 
blockv, temp1, temp2; 
820 
register vector unsigned short 
821 
pixelssum1, pixelssum2, temp3; 
822 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
823 
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 
824 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
825  
826 
temp1 = vec_ld(0, pixels);

827 
temp2 = vec_ld(16, pixels);

828 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

829 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
830 
{ 
831 
pixelsv2 = temp2; 
832 
} 
833 
else

834 
{ 
835 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

836 
} 
837 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
838 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
839 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
840 
(vector unsigned short)pixelsv2); 
841 
pixelssum1 = vec_add(pixelssum1, vcone); 
842  
843 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

844 
for (i = 0; i < h ; i++) { 
845 
int rightside = ((unsigned long)block & 0x0000000F); 
846 
blockv = vec_ld(0, block);

847  
848 
temp1 = vec_ld(line_size, pixels); 
849 
temp2 = vec_ld(line_size + 16, pixels);

850 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
851 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
852 
{ 
853 
pixelsv2 = temp2; 
854 
} 
855 
else

856 
{ 
857 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

858 
} 
859  
860 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
861 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
862 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
863 
(vector unsigned short)pixelsv2); 
864 
temp3 = vec_add(pixelssum1, pixelssum2); 
865 
temp3 = vec_sra(temp3, vctwo); 
866 
pixelssum1 = vec_add(pixelssum2, vcone); 
867 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
868  
869 
if (rightside)

870 
{ 
871 
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
872 
} 
873 
else

874 
{ 
875 
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
876 
} 
877  
878 
vec_st(blockv, 0, block);

879  
880 
block += line_size; 
881 
pixels += line_size; 
882 
} 
883  
884 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

885 
} 
886  
887 
/* next one assumes that ((line_size % 16) == 0) */

888 
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
889 
{ 
890 
POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);

891 
register int i; 
892 
register vector unsigned char 
893 
pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
894 
register vector unsigned char 
895 
blockv, temp1, temp2; 
896 
register vector unsigned short 
897 
pixelssum1, pixelssum2, temp3, 
898 
pixelssum3, pixelssum4, temp4; 
899 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
900 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
901  
902 
POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

903  
904 
temp1 = vec_ld(0, pixels);

905 
temp2 = vec_ld(16, pixels);

906 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

907 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
908 
{ 
909 
pixelsv2 = temp2; 
910 
} 
911 
else

912 
{ 
913 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

914 
} 
915 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
916 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
917 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
918 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
919 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
920 
(vector unsigned short)pixelsv4); 
921 
pixelssum3 = vec_add(pixelssum3, vctwo); 
922 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
923 
(vector unsigned short)pixelsv2); 
924 
pixelssum1 = vec_add(pixelssum1, vctwo); 
925  
926 
for (i = 0; i < h ; i++) { 
927 
blockv = vec_ld(0, block);

928  
929 
temp1 = vec_ld(line_size, pixels); 
930 
temp2 = vec_ld(line_size + 16, pixels);

931 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
932 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
933 
{ 
934 
pixelsv2 = temp2; 
935 
} 
936 
else

937 
{ 
938 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

939 
} 
940  
941 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
942 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
943 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
944 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
945  
946 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
947 
(vector unsigned short)pixelsv4); 
948 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
949 
(vector unsigned short)pixelsv2); 
950 
temp4 = vec_add(pixelssum3, pixelssum4); 
951 
temp4 = vec_sra(temp4, vctwo); 
952 
temp3 = vec_add(pixelssum1, pixelssum2); 
953 
temp3 = vec_sra(temp3, vctwo); 
954  
955 
pixelssum3 = vec_add(pixelssum4, vctwo); 
956 
pixelssum1 = vec_add(pixelssum2, vctwo); 
957  
958 
blockv = vec_packsu(temp3, temp4); 
959  
960 
vec_st(blockv, 0, block);

961  
962 
block += line_size; 
963 
pixels += line_size; 
964 
} 
965  
966 
POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

967 
} 
968  
969 
/* next one assumes that ((line_size % 16) == 0) */

970 
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
971 
{ 
972 
POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);

973 
register int i; 
974 
register vector unsigned char 
975 
pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
976 
register vector unsigned char 
977 
blockv, temp1, temp2; 
978 
register vector unsigned short 
979 
pixelssum1, pixelssum2, temp3, 
980 
pixelssum3, pixelssum4, temp4; 
981 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
982 
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 
983 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
984  
985 
POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

986  
987 
temp1 = vec_ld(0, pixels);

988 
temp2 = vec_ld(16, pixels);

989 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

990 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) 
991 
{ 
992 
pixelsv2 = temp2; 
993 
} 
994 
else

995 
{ 
996 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

997 
} 
998 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
999 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1000 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1001 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1002 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
1003 
(vector unsigned short)pixelsv4); 
1004 
pixelssum3 = vec_add(pixelssum3, vcone); 
1005 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1006 
(vector unsigned short)pixelsv2); 
1007 
pixelssum1 = vec_add(pixelssum1, vcone); 
1008  
1009 
for (i = 0; i < h ; i++) { 
1010 
blockv = vec_ld(0, block);

1011  
1012 
temp1 = vec_ld(line_size, pixels); 
1013 
temp2 = vec_ld(line_size + 16, pixels);

1014 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1015 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
1016 
{ 
1017 
pixelsv2 = temp2; 
1018 
} 
1019 
else

1020 
{ 
1021 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1022 
} 
1023  
1024 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
1025 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
1026 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1027 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1028  
1029 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
1030 
(vector unsigned short)pixelsv4); 
1031 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1032 
(vector unsigned short)pixelsv2); 
1033 
temp4 = vec_add(pixelssum3, pixelssum4); 
1034 
temp4 = vec_sra(temp4, vctwo); 
1035 
temp3 = vec_add(pixelssum1, pixelssum2); 
1036 
temp3 = vec_sra(temp3, vctwo); 
1037  
1038 
pixelssum3 = vec_add(pixelssum4, vcone); 
1039 
pixelssum1 = vec_add(pixelssum2, vcone); 
1040  
1041 
blockv = vec_packsu(temp3, temp4); 
1042  
1043 
vec_st(blockv, 0, block);

1044  
1045 
block += line_size; 
1046 
pixels += line_size; 
1047 
} 
1048  
1049 
POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

1050 
} 
1051  
1052 
int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ 
1053 
POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);

1054 
int sum;

1055 
register const vector unsigned char vzero = 
1056 
(const vector unsigned char)vec_splat_u8(0); 
1057 
register vector signed short temp0, temp1, temp2, temp3, temp4, 
1058 
temp5, temp6, temp7; 
1059 
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);

1060 
{ 
1061 
register const vector signed short vprod1 =(const vector signed short) 
1062 
AVV( 1,1, 1,1, 1,1, 1,1); 
1063 
register const vector signed short vprod2 =(const vector signed short) 
1064 
AVV( 1, 1,1,1, 1, 1,1,1); 
1065 
register const vector signed short vprod3 =(const vector signed short) 
1066 
AVV( 1, 1, 1, 1,1,1,1,1); 
1067 
register const vector unsigned char perm1 = (const vector unsigned char) 
1068 
AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 
1069 
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); 
1070 
register const vector unsigned char perm2 = (const vector unsigned char) 
1071 
AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 
1072 
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B); 
1073 
register const vector unsigned char perm3 = (const vector unsigned char) 
1074 
AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 
1075 
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); 
1076  
1077 
#define ONEITERBUTTERFLY(i, res) \

1078 
{ \ 
1079 
register vector unsigned char src1, src2, srcO; \ 
1080 
register vector unsigned char dst1, dst2, dstO; \ 
1081 
register vector signed short srcV, dstV; \ 
1082 
register vector signed short but0, but1, but2, op1, op2, op3; \ 
1083 
src1 = vec_ld(stride * i, src); \ 
1084 
src2 = vec_ld((stride * i) + 15, src); \

1085 
srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ 
1086 
dst1 = vec_ld(stride * i, dst); \ 
1087 
dst2 = vec_ld((stride * i) + 15, dst); \

1088 
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ 
1089 
/* promote the unsigned chars to signed shorts */ \

1090 
/* we're in the 8x8 function, we only care for the first 8 */ \

1091 
srcV = \ 
1092 
(vector signed short)vec_mergeh((vector signed char)vzero, \ 
1093 
(vector signed char)srcO); \ 
1094 
dstV = \ 
1095 
(vector signed short)vec_mergeh((vector signed char)vzero, \ 
1096 
(vector signed char)dstO); \ 
1097 
/* subtractions inside the first butterfly */ \

1098 
but0 = vec_sub(srcV, dstV); \ 
1099 
op1 = vec_perm(but0, but0, perm1); \ 
1100 
but1 = vec_mladd(but0, vprod1, op1); \ 
1101 
op2 = vec_perm(but1, but1, perm2); \ 
1102 
but2 = vec_mladd(but1, vprod2, op2); \ 
1103 
op3 = vec_perm(but2, but2, perm3); \ 
1104 
res = vec_mladd(but2, vprod3, op3); \ 
1105 
} 
1106 
ONEITERBUTTERFLY(0, temp0);

1107 
ONEITERBUTTERFLY(1, temp1);

1108 
ONEITERBUTTERFLY(2, temp2);

1109 
ONEITERBUTTERFLY(3, temp3);

1110 
ONEITERBUTTERFLY(4, temp4);

1111 
ONEITERBUTTERFLY(5, temp5);

1112 
ONEITERBUTTERFLY(6, temp6);

1113 
ONEITERBUTTERFLY(7, temp7);

1114 
} 
1115 
#undef ONEITERBUTTERFLY

1116 
{ 
1117 
register vector signed int vsum; 
1118 
register vector signed short line0 = vec_add(temp0, temp1); 
1119 
register vector signed short line1 = vec_sub(temp0, temp1); 
1120 
register vector signed short line2 = vec_add(temp2, temp3); 
1121 
register vector signed short line3 = vec_sub(temp2, temp3); 
1122 
register vector signed short line4 = vec_add(temp4, temp5); 
1123 
register vector signed short line5 = vec_sub(temp4, temp5); 
1124 
register vector signed short line6 = vec_add(temp6, temp7); 
1125 
register vector signed short line7 = vec_sub(temp6, temp7); 
1126  
1127 
register vector signed short line0B = vec_add(line0, line2); 
1128 
register vector signed short line2B = vec_sub(line0, line2); 
1129 
register vector signed short line1B = vec_add(line1, line3); 
1130 
register vector signed short line3B = vec_sub(line1, line3); 
1131 
register vector signed short line4B = vec_add(line4, line6); 
1132 
register vector signed short line6B = vec_sub(line4, line6); 
1133 
register vector signed short line5B = vec_add(line5, line7); 
1134 
register vector signed short line7B = vec_sub(line5, line7); 
1135  
1136 
register vector signed short line0C = vec_add(line0B, line4B); 
1137 
register vector signed short line4C = vec_sub(line0B, line4B); 
1138 
register vector signed short line1C = vec_add(line1B, line5B); 
1139 
register vector signed short line5C = vec_sub(line1B, line5B); 
1140 
register vector signed short line2C = vec_add(line2B, line6B); 
1141 
register vector signed short line6C = vec_sub(line2B, line6B); 
1142 
register vector signed short line3C = vec_add(line3B, line7B); 
1143 
register vector signed short line7C = vec_sub(line3B, line7B); 
1144  
1145 
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));

1146 
vsum = vec_sum4s(vec_abs(line1C), vsum); 
1147 
vsum = vec_sum4s(vec_abs(line2C), vsum); 
1148 
vsum = vec_sum4s(vec_abs(line3C), vsum); 
1149 
vsum = vec_sum4s(vec_abs(line4C), vsum); 
1150 
vsum = vec_sum4s(vec_abs(line5C), vsum); 
1151 
vsum = vec_sum4s(vec_abs(line6C), vsum); 
1152 
vsum = vec_sum4s(vec_abs(line7C), vsum); 
1153 
vsum = vec_sums(vsum, (vector signed int)vzero); 
1154 
vsum = vec_splat(vsum, 3);

1155 
vec_ste(vsum, 0, &sum);

1156 
} 
1157 
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);

1158 
return sum;

1159 
} 
1160  
1161 
/*
  16x8 works with 16 elements; this avoids replicating
  loads and gives the compiler more room for scheduling.
  It is only used from inside hadamard8_diff16_altivec.

  Unfortunately, it seems gcc-3.3 is a bit dumb: the
  compiled code has a LOT of spill code, as gcc (unlike xlc)
  apparently cannot keep everything in registers by itself.
  The following code therefore includes hand-made register
  allocation. It is not clean, but on a 7450 the resulting
  code is much faster (best case falls from 700+ cycles to 550).

  xlc doesn't add spill code, but it doesn't know how to
  schedule for the 7450, and its code isn't much faster than
  gcc-3.3 on the 7450 (but uses 25% fewer instructions...).

  On the 970, the hand-made RA is still a win (around 690
  vs. around 780), but xlc goes to around 660 on the
  regular C code...
*/

1182  
1183 
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { 
1184 
int sum;

1185 
register vector signed short 
1186 
temp0 REG_v(v0), 
1187 
temp1 REG_v(v1), 
1188 
temp2 REG_v(v2), 
1189 
temp3 REG_v(v3), 
1190 
temp4 REG_v(v4), 
1191 
temp5 REG_v(v5), 
1192 
temp6 REG_v(v6), 
1193 
temp7 REG_v(v7); 
1194 
register vector signed short 
1195 
temp0S REG_v(v8), 
1196 
temp1S REG_v(v9), 
1197 
temp2S REG_v(v10), 
1198 
temp3S REG_v(v11), 
1199 
temp4S REG_v(v12), 
1200 
temp5S REG_v(v13), 
1201 
temp6S REG_v(v14), 
1202 
temp7S REG_v(v15); 
1203 
register const vector unsigned char vzero REG_v(v31)= 
1204 
(const vector unsigned char)vec_splat_u8(0); 
1205 
{ 
1206 
register const vector signed short vprod1 REG_v(v16)= 
1207 
(const vector signed short)AVV( 1,1, 1,1, 1,1, 1,1); 
1208 
register const vector signed short vprod2 REG_v(v17)= 
1209 
(const vector signed short)AVV( 1, 1,1,1, 1, 1,1,1); 
1210 
register const vector signed short vprod3 REG_v(v18)= 
1211 
(const vector signed short)AVV( 1, 1, 1, 1,1,1,1,1); 
1212 
register const vector unsigned char perm1 REG_v(v19)= 
1213 
(const vector unsigned char) 
1214 
AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 
1215 
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); 
1216 
register const vector unsigned char perm2 REG_v(v20)= 
1217 
(const vector unsigned char) 
1218 
AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 
1219 
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B); 
1220 
register const vector unsigned char perm3 REG_v(v21)= 
1221 
(const vector unsigned char) 
1222 
AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 
1223 
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); 
1224  
1225 
#define ONEITERBUTTERFLY(i, res1, res2) \

1226 
{ \ 
1227 
register vector unsigned char src1 REG_v(v22), \ 
1228 
src2 REG_v(v23), \ 
1229 
dst1 REG_v(v24), \ 
1230 
dst2 REG_v(v25), \ 
1231 
srcO REG_v(v22), \ 
1232 
dstO REG_v(v23); \ 
1233 
\ 
1234 
register vector signed short srcV REG_v(v24), \ 
1235 
dstV REG_v(v25), \ 
1236 
srcW REG_v(v26), \ 
1237 
dstW REG_v(v27), \ 
1238 
but0 REG_v(v28), \ 
1239 
but0S REG_v(v29), \ 
1240 
op1 REG_v(v30), \ 
1241 
but1 REG_v(v22), \ 
1242 
op1S REG_v(v23), \ 
1243 
but1S REG_v(v24), \ 
1244 
op2 REG_v(v25), \ 
1245 
but2 REG_v(v26), \ 
1246 
op2S REG_v(v27), \ 
1247 
but2S REG_v(v28), \ 
1248 
op3 REG_v(v29), \ 
1249 
op3S REG_v(v30); \ 
1250 
\ 
1251 
src1 = vec_ld(stride * i, src); \ 
1252 
src2 = vec_ld((stride * i) + 16, src); \

1253 
srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ 
1254 
dst1 = vec_ld(stride * i, dst); \ 
1255 
dst2 = vec_ld((stride * i) + 16, dst); \

1256 
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ 
1257 
/* promote the unsigned chars to signed shorts */ \

1258 
srcV = \ 
1259 
(vector signed short)vec_mergeh((vector signed char)vzero, \ 
1260 
(vector signed char)srcO); \ 
1261 
dstV = \ 
1262 
(vector signed short)vec_mergeh((vector signed char)vzero, \ 
1263 
(vector signed char)dstO); \ 
1264 
srcW = \ 
1265 
(vector signed short)vec_mergel((vector signed char)vzero, \ 
1266 
(vector signed char)srcO); \ 
1267 
dstW = \ 
1268 
(vector signed short)vec_mergel((vector signed char)vzero, \ 
1269 
(vector signed char)dstO); \ 
1270 
/* subtractions inside the first butterfly */ \

1271 
but0 = vec_sub(srcV, dstV); \ 
1272 
but0S = vec_sub(srcW, dstW); \ 
1273 
op1 = vec_perm(but0, but0, perm1); \ 
1274 
but1 = vec_mladd(but0, vprod1, op1); \ 
1275 
op1S = vec_perm(but0S, but0S, perm1); \ 
1276 
but1S = vec_mladd(but0S, vprod1, op1S); \ 
1277 
op2 = vec_perm(but1, but1, perm2); \ 
1278 
but2 = vec_mladd(but1, vprod2, op2); \ 
1279 
op2S = vec_perm(but1S, but1S, perm2); \ 
1280 
but2S = vec_mladd(but1S, vprod2, op2S); \ 
1281 
op3 = vec_perm(but2, but2, perm3); \ 
1282 
res1 = vec_mladd(but2, vprod3, op3); \ 
1283 
op3S = vec_perm(but2S, but2S, perm3); \ 
1284 
res2 = vec_mladd(but2S, vprod3, op3S); \ 
1285 
} 
1286 
ONEITERBUTTERFLY(0, temp0, temp0S);

1287 
ONEITERBUTTERFLY(1, temp1, temp1S);

1288 
ONEITERBUTTERFLY(2, temp2, temp2S);

1289 
ONEITERBUTTERFLY(3, temp3, temp3S);

1290 
ONEITERBUTTERFLY(4, temp4, temp4S);

1291 
ONEITERBUTTERFLY(5, temp5, temp5S);

1292 
ONEITERBUTTERFLY(6, temp6, temp6S);

1293 
ONEITERBUTTERFLY(7, temp7, temp7S);

1294 
} 
1295 
#undef ONEITERBUTTERFLY

1296 
{ 
1297 
register vector signed int vsum; 
1298 
register vector signed short line0S, line1S, line2S, line3S, line4S, 
1299 
line5S, line6S, line7S, line0BS,line2BS, 
1300 
line1BS,line3BS,line4BS,line6BS,line5BS, 
1301 
line7BS,line0CS,line4CS,line1CS,line5CS, 
1302 
line2CS,line6CS,line3CS,line7CS; 
1303  
1304 
register vector signed short line0 = vec_add(temp0, temp1); 
1305 
register vector signed short line1 = vec_sub(temp0, temp1); 
1306 
register vector signed short line2 = vec_add(temp2, temp3); 
1307 
register vector signed short line3 = vec_sub(temp2, temp3); 
1308 
register vector signed short line4 = vec_add(temp4, temp5); 
1309 
register vector signed short line5 = vec_sub(temp4, temp5); 
1310 
register vector signed short line6 = vec_add(temp6, temp7); 
1311 
register vector signed short line7 = vec_sub(temp6, temp7); 
1312  
1313 
register vector signed short line0B = vec_add(line0, line2); 
1314 
register vector signed short line2B = vec_sub(line0, line2); 
1315 
register vector signed short line1B = vec_add(line1, line3); 
1316 
register vector signed short line3B = vec_sub(line1, line3); 
1317 
register vector signed short line4B = vec_add(line4, line6); 
1318 
register vector signed short line6B = vec_sub(line4, line6); 
1319 
register vector signed short line5B = vec_add(line5, line7); 
1320 
register vector signed short line7B = vec_sub(line5, line7); 
1321  
1322 
register vector signed short line0C = vec_add(line0B, line4B); 
1323 
register vector signed short line4C = vec_sub(line0B, line4B); 
1324 
register vector signed short line1C = vec_add(line1B, line5B); 
1325 
register vector signed short line5C = vec_sub(line1B, line5B); 
1326 
register vector signed short line2C = vec_add(line2B, line6B); 
1327 
register vector signed short line6C = vec_sub(line2B, line6B); 
1328 
register vector signed short line3C = vec_add(line3B, line7B); 
1329 
register vector signed short line7C = vec_sub(line3B, line7B); 
1330  
1331 
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));

1332 
vsum = vec_sum4s(vec_abs(line1C), vsum); 
1333 
vsum = vec_sum4s(vec_abs(line2C), vsum); 
1334 
vsum = vec_sum4s(vec_abs(line3C), vsum); 
1335 
vsum = vec_sum4s(vec_abs(line4C), vsum); 
1336 
vsum = vec_sum4s(vec_abs(line5C), vsum); 
1337 
vsum = vec_sum4s(vec_abs(line6C), vsum); 
1338 
vsum = vec_sum4s(vec_abs(line7C), vsum); 
1339  
1340 
line0S = vec_add(temp0S, temp1S); 
1341 
line1S = vec_sub(temp0S, temp1S); 
1342 
line2S = vec_add(temp2S, temp3S); 
1343 
line3S = vec_sub(temp2S, temp3S); 
1344 
line4S = vec_add(temp4S, temp5S); 
1345 
line5S = vec_sub(temp4S, temp5S); 
1346 
line6S = vec_add(temp6S, temp7S); 
1347 
line7S = vec_sub(temp6S, temp7S); 
1348  
1349 
line0BS = vec_add(line0S, line2S); 
1350 
line2BS = vec_sub(line0S, line2S); 
1351 
line1BS = vec_add(line1S, line3S); 
1352 
line3BS = vec_sub(line1S, line3S); 
1353 
line4BS = vec_add(line4S, line6S); 
1354 
line6BS = vec_sub(line4S, line6S); 
1355 
line5BS = vec_add(line5S, line7S); 
1356 
line7BS = vec_sub(line5S, line7S); 
1357  
1358 
line0CS = vec_add(line0BS, line4BS); 
1359 
line4CS = vec_sub(line0BS, line4BS); 
1360 
line1CS = vec_add(line1BS, line5BS); 
1361 
line5CS = vec_sub(line1BS, line5BS); 
1362 
line2CS = vec_add(line2BS, line6BS); 
1363 
line6CS = vec_sub(line2BS, line6BS); 
1364 
line3CS = vec_add(line3BS, line7BS); 
1365 
line7CS = vec_sub(line3BS, line7BS); 
1366  
1367 
vsum = vec_sum4s(vec_abs(line0CS), vsum); 
1368 
vsum = vec_sum4s(vec_abs(line1CS), vsum); 
1369 
vsum = vec_sum4s(vec_abs(line2CS), vsum); 
1370 
vsum = vec_sum4s(vec_abs(line3CS), vsum); 
1371 
vsum = vec_sum4s(vec_abs(line4CS), vsum); 
1372 
vsum = vec_sum4s(vec_abs(line5CS), vsum); 
1373 
vsum = vec_sum4s(vec_abs(line6CS), vsum); 
1374 
vsum = vec_sum4s(vec_abs(line7CS), vsum); 
1375 
vsum = vec_sums(vsum, (vector signed int)vzero); 
1376 
vsum = vec_splat(vsum, 3);

1377 
vec_ste(vsum, 0, &sum);

1378 
} 
1379 
return sum;

1380 
} 
1381  
1382 
int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ 
1383 
POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);

1384 
int score;

1385 
POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);

1386 
score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

1387 
if (h==16) { 
1388 
dst += 8*stride;

1389 
src += 8*stride;

1390 
score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

1391 
} 
1392 
POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);

1393 
return score;

1394 
} 
1395  
1396 
static void vorbis_inverse_coupling_altivec(float *mag, float *ang, 
1397 
int blocksize)

1398 
{ 
1399 
int i;

1400 
vector float m, a;

1401 
vector bool int t0, t1; 
1402 
const vector unsigned int v_31 = //XXX 
1403 
vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1)); 
1404 
for(i=0; i<blocksize; i+=4) { 
1405 
m = vec_ld(0, mag+i);

1406 
a = vec_ld(0, ang+i);

1407 
t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); 
1408 
t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); 
1409 
a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31)); 
1410 
t0 = (vector bool int)vec_and(a, t1); 
1411 
t1 = (vector bool int)vec_andc(a, t1); 
1412 
a = vec_sub(m, (vector float)t1);

1413 
m = vec_add(m, (vector float)t0);

1414 
vec_stl(a, 0, ang+i);

1415 
vec_stl(m, 0, mag+i);

1416 
} 
1417 
} 
1418  
1419 
/* next one assumes that ((line_size % 8) == 0) */

1420 
void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
1421 
{ 
1422 
POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);

1423 
register int i; 
1424 
register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 
1425 
register vector unsigned char blockv, temp1, temp2, blocktemp; 
1426 
register vector unsigned short pixelssum1, pixelssum2, temp3; 
1427  
1428 
register const vector unsigned char vczero = (const vector unsigned char) 
1429 
vec_splat_u8(0);

1430 
register const vector unsigned short vctwo = (const vector unsigned short) 
1431 
vec_splat_u16(2);

1432  
1433 
temp1 = vec_ld(0, pixels);

1434 
temp2 = vec_ld(16, pixels);

1435 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

1436 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
1437 
pixelsv2 = temp2; 
1438 
} else {

1439 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

1440 
} 
1441 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1442 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1443 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1444 
(vector unsigned short)pixelsv2); 
1445 
pixelssum1 = vec_add(pixelssum1, vctwo); 
1446  
1447 
POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);

1448 
for (i = 0; i < h ; i++) { 
1449 
int rightside = ((unsigned long)block & 0x0000000F); 
1450 
blockv = vec_ld(0, block);

1451  
1452 
temp1 = vec_ld(line_size, pixels); 
1453 
temp2 = vec_ld(line_size + 16, pixels);

1454 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1455 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) 
1456 
{ 
1457 
pixelsv2 = temp2; 
1458 
} else {

1459 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1460 
} 
1461  
1462 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1463 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1464 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1465 
(vector unsigned short)pixelsv2); 
1466 
temp3 = vec_add(pixelssum1, pixelssum2); 
1467 
temp3 = vec_sra(temp3, vctwo); 
1468 
pixelssum1 = vec_add(pixelssum2, vctwo); 
1469 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
1470  
1471 
if (rightside) {

1472 
blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
1473 
} else {

1474 
blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
1475 
} 
1476  
1477 
blockv = vec_avg(blocktemp, blockv); 
1478 
vec_st(blockv, 0, block);

1479  
1480 
block += line_size; 
1481 
pixels += line_size; 
1482 
} 
1483  
1484 
POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);

1485 
} 
1486  
1487 
/**
 * Install the AltiVec implementations from this file into a DSPContext.
 *
 * Only the function pointers listed below are overwritten; every other
 * member of the context is left as it was.
 *
 * @param c     context whose function pointers are updated
 * @param avctx not used by this function
 */
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    /* sum of absolute differences */
    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0]= sad16_altivec;
    c->sad[1]= sad8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1]= sse8_altivec;
    c->sse[0]= sse16_altivec;
    c->pix_sum = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels = get_pixels_altivec;
    c->add_bytes= add_bytes_altivec;
    c->put_pixels_tab[0][0] = put_pixels16_altivec;
    /* the two functions do the same thing, so use the same code */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
    c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
    /* index 3 of each table holds the xy2 (half-pel in both directions) variant */
    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
    c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
    /* only installed when the Vorbis decoder is enabled in the build */
    if (ENABLE_VORBIS_DECODER)
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
}