ffmpeg / libavcodec / ppc / dsputil_altivec.c @ 3af1fe82
History  View  Annotate  Download (55 KB)
1 
/*


2 
* Copyright (c) 2002 Brian Foley

3 
* Copyright (c) 2002 Dieter Shirley

4 
* Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>

5 
*

6 
* This file is part of FFmpeg.

7 
*

8 
* FFmpeg is free software; you can redistribute it and/or

9 
* modify it under the terms of the GNU Lesser General Public

10 
* License as published by the Free Software Foundation; either

11 
* version 2.1 of the License, or (at your option) any later version.

12 
*

13 
* FFmpeg is distributed in the hope that it will be useful,

14 
* but WITHOUT ANY WARRANTY; without even the implied warranty of

15 
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

16 
* Lesser General Public License for more details.

17 
*

18 
* You should have received a copy of the GNU Lesser General Public

19 
* License along with FFmpeg; if not, write to the Free Software

20 
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

21 
*/

22  
23 
#include "config.h" 
24 
#if HAVE_ALTIVEC_H

25 
#include <altivec.h> 
26 
#endif

27 
#include "libavcodec/dsputil.h" 
28 
#include "util_altivec.h" 
29 
#include "types_altivec.h" 
30 
#include "dsputil_altivec.h" 
31  
32 
static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
33 
{ 
34 
int i;

35 
int s;

36 
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
37 
vector unsigned char *tv; 
38 
vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; 
39 
vector unsigned int sad; 
40 
vector signed int sumdiffs; 
41  
42 
s = 0;

43 
sad = (vector unsigned int)vec_splat_u32(0); 
44 
for (i = 0; i < h; i++) { 
45 
/* Read unaligned pixels into our vectors. The vectors are as follows:

46 
pix1v: pix1[0]pix1[15]

47 
pix2v: pix2[0]pix2[15] pix2iv: pix2[1]pix2[16] */

48 
tv = (vector unsigned char *) pix1; 
49 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
50  
51 
tv = (vector unsigned char *) &pix2[0]; 
52 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
53  
54 
tv = (vector unsigned char *) &pix2[1]; 
55 
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); 
56  
57 
/* Calculate the average vector */

58 
avgv = vec_avg(pix2v, pix2iv); 
59  
60 
/* Calculate a sum of abs differences vector */

61 
t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
62  
63 
/* Add each 4 pixel group together and put 4 results into sad */

64 
sad = vec_sum4s(t5, sad); 
65  
66 
pix1 += line_size; 
67 
pix2 += line_size; 
68 
} 
69 
/* Sum up the four partial sums, and put the result into s */

70 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
71 
sumdiffs = vec_splat(sumdiffs, 3);

72 
vec_ste(sumdiffs, 0, &s);

73  
74 
return s;

75 
} 
76  
77 
static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
78 
{ 
79 
int i;

80 
int s;

81 
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
82 
vector unsigned char *tv; 
83 
vector unsigned char pix1v, pix2v, pix3v, avgv, t5; 
84 
vector unsigned int sad; 
85 
vector signed int sumdiffs; 
86 
uint8_t *pix3 = pix2 + line_size; 
87  
88 
s = 0;

89 
sad = (vector unsigned int)vec_splat_u32(0); 
90  
91 
/* Due to the fact that pix3 = pix2 + line_size, the pix3 of one

92 
iteration becomes pix2 in the next iteration. We can use this

93 
fact to avoid a potentially expensive unaligned read, each

94 
time around the loop.

95 
Read unaligned pixels into our vectors. The vectors are as follows:

96 
pix2v: pix2[0]pix2[15]

97 
Split the pixel vectors into shorts */

98 
tv = (vector unsigned char *) &pix2[0]; 
99 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
100  
101 
for (i = 0; i < h; i++) { 
102 
/* Read unaligned pixels into our vectors. The vectors are as follows:

103 
pix1v: pix1[0]pix1[15]

104 
pix3v: pix3[0]pix3[15] */

105 
tv = (vector unsigned char *) pix1; 
106 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
107  
108 
tv = (vector unsigned char *) &pix3[0]; 
109 
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); 
110  
111 
/* Calculate the average vector */

112 
avgv = vec_avg(pix2v, pix3v); 
113  
114 
/* Calculate a sum of abs differences vector */

115 
t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
116  
117 
/* Add each 4 pixel group together and put 4 results into sad */

118 
sad = vec_sum4s(t5, sad); 
119  
120 
pix1 += line_size; 
121 
pix2v = pix3v; 
122 
pix3 += line_size; 
123  
124 
} 
125  
126 
/* Sum up the four partial sums, and put the result into s */

127 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
128 
sumdiffs = vec_splat(sumdiffs, 3);

129 
vec_ste(sumdiffs, 0, &s);

130 
return s;

131 
} 
132  
133 
static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
134 
{ 
135 
int i;

136 
int s;

137 
uint8_t *pix3 = pix2 + line_size; 
138 
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
139 
const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); 
140 
vector unsigned char *tv, avgv, t5; 
141 
vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; 
142 
vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; 
143 
vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; 
144 
vector unsigned short avghv, avglv; 
145 
vector unsigned short t1, t2, t3, t4; 
146 
vector unsigned int sad; 
147 
vector signed int sumdiffs; 
148  
149 
sad = (vector unsigned int)vec_splat_u32(0); 
150  
151 
s = 0;

152  
153 
/* Due to the fact that pix3 = pix2 + line_size, the pix3 of one

154 
iteration becomes pix2 in the next iteration. We can use this

155 
fact to avoid a potentially expensive unaligned read, as well

156 
as some splitting, and vector addition each time around the loop.

157 
Read unaligned pixels into our vectors. The vectors are as follows:

158 
pix2v: pix2[0]pix2[15] pix2iv: pix2[1]pix2[16]

159 
Split the pixel vectors into shorts */

160 
tv = (vector unsigned char *) &pix2[0]; 
161 
pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); 
162  
163 
tv = (vector unsigned char *) &pix2[1]; 
164 
pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); 
165  
166 
pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); 
167 
pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); 
168 
pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); 
169 
pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); 
170 
t1 = vec_add(pix2hv, pix2ihv); 
171 
t2 = vec_add(pix2lv, pix2ilv); 
172  
173 
for (i = 0; i < h; i++) { 
174 
/* Read unaligned pixels into our vectors. The vectors are as follows:

175 
pix1v: pix1[0]pix1[15]

176 
pix3v: pix3[0]pix3[15] pix3iv: pix3[1]pix3[16] */

177 
tv = (vector unsigned char *) pix1; 
178 
pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); 
179  
180 
tv = (vector unsigned char *) &pix3[0]; 
181 
pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); 
182  
183 
tv = (vector unsigned char *) &pix3[1]; 
184 
pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); 
185  
186 
/* Note that AltiVec does have vec_avg, but this works on vector pairs

187 
and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding

188 
would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.

189 
Instead, we have to split the pixel vectors into vectors of shorts,

190 
and do the averaging by hand. */

191  
192 
/* Split the pixel vectors into shorts */

193 
pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); 
194 
pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); 
195 
pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); 
196 
pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); 
197  
198 
/* Do the averaging on them */

199 
t3 = vec_add(pix3hv, pix3ihv); 
200 
t4 = vec_add(pix3lv, pix3ilv); 
201  
202 
avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); 
203 
avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); 
204  
205 
/* Pack the shorts back into a result */

206 
avgv = vec_pack(avghv, avglv); 
207  
208 
/* Calculate a sum of abs differences vector */

209 
t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); 
210  
211 
/* Add each 4 pixel group together and put 4 results into sad */

212 
sad = vec_sum4s(t5, sad); 
213  
214 
pix1 += line_size; 
215 
pix3 += line_size; 
216 
/* Transfer the calculated values for pix3 into pix2 */

217 
t1 = t3; 
218 
t2 = t4; 
219 
} 
220 
/* Sum up the four partial sums, and put the result into s */

221 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
222 
sumdiffs = vec_splat(sumdiffs, 3);

223 
vec_ste(sumdiffs, 0, &s);

224  
225 
return s;

226 
} 
227  
228 
static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
229 
{ 
230 
int i;

231 
int s;

232 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
233 
vector unsigned char perm1, perm2, pix1v_low, pix1v_high, pix2v_low, pix2v_high; 
234 
vector unsigned char t1, t2, t3,t4, t5; 
235 
vector unsigned int sad; 
236 
vector signed int sumdiffs; 
237  
238 
sad = (vector unsigned int)vec_splat_u32(0); 
239  
240  
241 
for (i = 0; i < h; i++) { 
242 
/* Read potentially unaligned pixels into t1 and t2 */

243 
perm1 = vec_lvsl(0, pix1);

244 
pix1v_high = vec_ld( 0, pix1);

245 
pix1v_low = vec_ld(15, pix1);

246 
perm2 = vec_lvsl(0, pix2);

247 
pix2v_high = vec_ld( 0, pix2);

248 
pix2v_low = vec_ld(15, pix2);

249 
t1 = vec_perm(pix1v_high, pix1v_low, perm1); 
250 
t2 = vec_perm(pix2v_high, pix2v_low, perm2); 
251  
252 
/* Calculate a sum of abs differences vector */

253 
t3 = vec_max(t1, t2); 
254 
t4 = vec_min(t1, t2); 
255 
t5 = vec_sub(t3, t4); 
256  
257 
/* Add each 4 pixel group together and put 4 results into sad */

258 
sad = vec_sum4s(t5, sad); 
259  
260 
pix1 += line_size; 
261 
pix2 += line_size; 
262 
} 
263  
264 
/* Sum up the four partial sums, and put the result into s */

265 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
266 
sumdiffs = vec_splat(sumdiffs, 3);

267 
vec_ste(sumdiffs, 0, &s);

268  
269 
return s;

270 
} 
271  
272 
static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
273 
{ 
274 
int i;

275 
int s;

276 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
277 
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 
278 
vector unsigned char t1, t2, t3,t4, t5; 
279 
vector unsigned int sad; 
280 
vector signed int sumdiffs; 
281  
282 
sad = (vector unsigned int)vec_splat_u32(0); 
283  
284 
permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; 
285  
286 
for (i = 0; i < h; i++) { 
287 
/* Read potentially unaligned pixels into t1 and t2

288 
Since we're reading 16 pixels, and actually only want 8,

289 
mask out the last 8 pixels. The 0s don't change the sum. */

290 
perm1 = vec_lvsl(0, pix1);

291 
pix1v = (vector unsigned char *) pix1; 
292 
perm2 = vec_lvsl(0, pix2);

293 
pix2v = (vector unsigned char *) pix2; 
294 
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); 
295 
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); 
296  
297 
/* Calculate a sum of abs differences vector */

298 
t3 = vec_max(t1, t2); 
299 
t4 = vec_min(t1, t2); 
300 
t5 = vec_sub(t3, t4); 
301  
302 
/* Add each 4 pixel group together and put 4 results into sad */

303 
sad = vec_sum4s(t5, sad); 
304  
305 
pix1 += line_size; 
306 
pix2 += line_size; 
307 
} 
308  
309 
/* Sum up the four partial sums, and put the result into s */

310 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
311 
sumdiffs = vec_splat(sumdiffs, 3);

312 
vec_ste(sumdiffs, 0, &s);

313  
314 
return s;

315 
} 
316  
317 
static int pix_norm1_altivec(uint8_t *pix, int line_size) 
318 
{ 
319 
int i;

320 
int s;

321 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
322 
vector unsigned char *tv; 
323 
vector unsigned char pixv; 
324 
vector unsigned int sv; 
325 
vector signed int sum; 
326  
327 
sv = (vector unsigned int)vec_splat_u32(0); 
328  
329 
s = 0;

330 
for (i = 0; i < 16; i++) { 
331 
/* Read in the potentially unaligned pixels */

332 
tv = (vector unsigned char *) pix; 
333 
pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); 
334  
335 
/* Square the values, and add them to our sum */

336 
sv = vec_msum(pixv, pixv, sv); 
337  
338 
pix += line_size; 
339 
} 
340 
/* Sum up the four partial sums, and put the result into s */

341 
sum = vec_sums((vector signed int) sv, (vector signed int) zero); 
342 
sum = vec_splat(sum, 3);

343 
vec_ste(sum, 0, &s);

344  
345 
return s;

346 
} 
347  
348 
/**

349 
* Sum of Squared Errors for a 8x8 block.

350 
* AltiVecenhanced.

351 
* It's the sad8_altivec code above w/ squaring added.

352 
*/

353 
static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
354 
{ 
355 
int i;

356 
int s;

357 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
358 
vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; 
359 
vector unsigned char t1, t2, t3,t4, t5; 
360 
vector unsigned int sum; 
361 
vector signed int sumsqr; 
362  
363 
sum = (vector unsigned int)vec_splat_u32(0); 
364  
365 
permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; 
366  
367  
368 
for (i = 0; i < h; i++) { 
369 
/* Read potentially unaligned pixels into t1 and t2

370 
Since we're reading 16 pixels, and actually only want 8,

371 
mask out the last 8 pixels. The 0s don't change the sum. */

372 
perm1 = vec_lvsl(0, pix1);

373 
pix1v = (vector unsigned char *) pix1; 
374 
perm2 = vec_lvsl(0, pix2);

375 
pix2v = (vector unsigned char *) pix2; 
376 
t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); 
377 
t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); 
378  
379 
/* Since we want to use unsigned chars, we can take advantage

380 
of the fact that abs(ab)^2 = (ab)^2. */

381  
382 
/* Calculate abs differences vector */

383 
t3 = vec_max(t1, t2); 
384 
t4 = vec_min(t1, t2); 
385 
t5 = vec_sub(t3, t4); 
386  
387 
/* Square the values and add them to our sum */

388 
sum = vec_msum(t5, t5, sum); 
389  
390 
pix1 += line_size; 
391 
pix2 += line_size; 
392 
} 
393  
394 
/* Sum up the four partial sums, and put the result into s */

395 
sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); 
396 
sumsqr = vec_splat(sumsqr, 3);

397 
vec_ste(sumsqr, 0, &s);

398  
399 
return s;

400 
} 
401  
402 
/**

403 
* Sum of Squared Errors for a 16x16 block.

404 
* AltiVecenhanced.

405 
* It's the sad16_altivec code above w/ squaring added.

406 
*/

407 
static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) 
408 
{ 
409 
int i;

410 
int s;

411 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
412 
vector unsigned char perm1, perm2, *pix1v, *pix2v; 
413 
vector unsigned char t1, t2, t3,t4, t5; 
414 
vector unsigned int sum; 
415 
vector signed int sumsqr; 
416  
417 
sum = (vector unsigned int)vec_splat_u32(0); 
418  
419 
for (i = 0; i < h; i++) { 
420 
/* Read potentially unaligned pixels into t1 and t2 */

421 
perm1 = vec_lvsl(0, pix1);

422 
pix1v = (vector unsigned char *) pix1; 
423 
perm2 = vec_lvsl(0, pix2);

424 
pix2v = (vector unsigned char *) pix2; 
425 
t1 = vec_perm(pix1v[0], pix1v[1], perm1); 
426 
t2 = vec_perm(pix2v[0], pix2v[1], perm2); 
427  
428 
/* Since we want to use unsigned chars, we can take advantage

429 
of the fact that abs(ab)^2 = (ab)^2. */

430  
431 
/* Calculate abs differences vector */

432 
t3 = vec_max(t1, t2); 
433 
t4 = vec_min(t1, t2); 
434 
t5 = vec_sub(t3, t4); 
435  
436 
/* Square the values and add them to our sum */

437 
sum = vec_msum(t5, t5, sum); 
438  
439 
pix1 += line_size; 
440 
pix2 += line_size; 
441 
} 
442  
443 
/* Sum up the four partial sums, and put the result into s */

444 
sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); 
445 
sumsqr = vec_splat(sumsqr, 3);

446 
vec_ste(sumsqr, 0, &s);

447  
448 
return s;

449 
} 
450  
451 
static int pix_sum_altivec(uint8_t * pix, int line_size) 
452 
{ 
453 
const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); 
454 
vector unsigned char perm, *pixv; 
455 
vector unsigned char t1; 
456 
vector unsigned int sad; 
457 
vector signed int sumdiffs; 
458  
459 
int i;

460 
int s;

461  
462 
sad = (vector unsigned int)vec_splat_u32(0); 
463  
464 
for (i = 0; i < 16; i++) { 
465 
/* Read the potentially unaligned 16 pixels into t1 */

466 
perm = vec_lvsl(0, pix);

467 
pixv = (vector unsigned char *) pix; 
468 
t1 = vec_perm(pixv[0], pixv[1], perm); 
469  
470 
/* Add each 4 pixel group together and put 4 results into sad */

471 
sad = vec_sum4s(t1, sad); 
472  
473 
pix += line_size; 
474 
} 
475  
476 
/* Sum up the four partial sums, and put the result into s */

477 
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); 
478 
sumdiffs = vec_splat(sumdiffs, 3);

479 
vec_ste(sumdiffs, 0, &s);

480  
481 
return s;

482 
} 
483  
484 
/**
 * Copy an 8x8 block of 8-bit pixels into a block of 16-bit coefficients.
 *
 * Each of the 8 rows is zero-extended from bytes to shorts and stored as one
 * 16-byte vector at byte offset i*16 of block.
 *
 * @param block     destination, 64 coefficients; assumed 16-byte aligned
 *                  (vec_st is used for the stores)
 * @param pixels    top-left source pixel, any alignment
 * @param line_size byte offset between consecutive source rows
 */
static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char bytes;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // A whole vector is produced although only the first 8 pixels are
        // used below. The second load uses offset 7 (the last wanted byte),
        // so memory past the 8 pixels is only touched when they really
        // straddle two aligned vectors, and no misaligned vector pointer is
        // dereferenced.
        bytes = vec_perm(vec_ld(0, pixels), vec_ld(7, pixels), vec_lvsl(0, pixels));

        // convert the first 8 bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
508  
509 
static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, 
510 
const uint8_t *s2, int stride) 
511 
{ 
512 
int i;

513 
vector unsigned char perm, bytes, *pixv; 
514 
const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); 
515 
vector signed short shorts1, shorts2; 
516  
517 
for (i = 0; i < 4; i++) { 
518 
// Read potentially unaligned pixels

519 
// We're reading 16 pixels, and actually only want 8,

520 
// but we simply ignore the extras.

521 
perm = vec_lvsl(0, s1);

522 
pixv = (vector unsigned char *) s1; 
523 
bytes = vec_perm(pixv[0], pixv[1], perm); 
524  
525 
// convert the bytes into shorts

526 
shorts1 = (vector signed short)vec_mergeh(zero, bytes); 
527  
528 
// Do the same for the second block of pixels

529 
perm = vec_lvsl(0, s2);

530 
pixv = (vector unsigned char *) s2; 
531 
bytes = vec_perm(pixv[0], pixv[1], perm); 
532  
533 
// convert the bytes into shorts

534 
shorts2 = (vector signed short)vec_mergeh(zero, bytes); 
535  
536 
// Do the subtraction

537 
shorts1 = vec_sub(shorts1, shorts2); 
538  
539 
// save the data to the block, we assume the block is 16byte aligned

540 
vec_st(shorts1, 0, (vector signed short*)block); 
541  
542 
s1 += stride; 
543 
s2 += stride; 
544 
block += 8;

545  
546  
547 
// The code below is a copy of the code above... This is a manual

548 
// unroll.

549  
550 
// Read potentially unaligned pixels

551 
// We're reading 16 pixels, and actually only want 8,

552 
// but we simply ignore the extras.

553 
perm = vec_lvsl(0, s1);

554 
pixv = (vector unsigned char *) s1; 
555 
bytes = vec_perm(pixv[0], pixv[1], perm); 
556  
557 
// convert the bytes into shorts

558 
shorts1 = (vector signed short)vec_mergeh(zero, bytes); 
559  
560 
// Do the same for the second block of pixels

561 
perm = vec_lvsl(0, s2);

562 
pixv = (vector unsigned char *) s2; 
563 
bytes = vec_perm(pixv[0], pixv[1], perm); 
564  
565 
// convert the bytes into shorts

566 
shorts2 = (vector signed short)vec_mergeh(zero, bytes); 
567  
568 
// Do the subtraction

569 
shorts1 = vec_sub(shorts1, shorts2); 
570  
571 
// save the data to the block, we assume the block is 16byte aligned

572 
vec_st(shorts1, 0, (vector signed short*)block); 
573  
574 
s1 += stride; 
575 
s2 += stride; 
576 
block += 8;

577 
} 
578 
} 
579  
580  
581 
/**
 * Zero 128 bytes starting at block.
 *
 * zero_s16v is stored with vec_st at byte offsets 0, 16, ..., 112.
 * NOTE(review): vec_st is an aligned store, so block is presumably expected
 * to be 16-byte aligned - confirm against the callers.
 *
 * @param block coefficient block to clear
 */
static void clear_block_altivec(DCTELEM *block) {
    LOAD_ZERO;
    int offset;

    for (offset = 0; offset < 128; offset += 16) {
        vec_st(zero_s16v, offset, block);
    }
}
592  
593  
594 
static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { 
595 
register int i; 
596 
register vector unsigned char vdst, vsrc; 
597  
598 
/* dst and src are 16 bytesaligned (guaranteed) */

599 
for (i = 0 ; (i + 15) < w ; i+=16) { 
600 
vdst = vec_ld(i, (unsigned char*)dst); 
601 
vsrc = vec_ld(i, (unsigned char*)src); 
602 
vdst = vec_add(vsrc, vdst); 
603 
vec_st(vdst, i, (unsigned char*)dst); 
604 
} 
605 
/* if w is not a multiple of 16 */

606 
for (; (i < w) ; i++) {

607 
dst[i] = src[i]; 
608 
} 
609 
} 
610  
611 
/* next one assumes that ((line_size % 16) == 0) */

612 
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
613 
{ 
614 
register vector unsigned char pixelsv1, pixelsv2; 
615 
register vector unsigned char pixelsv1B, pixelsv2B; 
616 
register vector unsigned char pixelsv1C, pixelsv2C; 
617 
register vector unsigned char pixelsv1D, pixelsv2D; 
618  
619 
register vector unsigned char perm = vec_lvsl(0, pixels); 
620 
int i;

621 
register int line_size_2 = line_size << 1; 
622 
register int line_size_3 = line_size + line_size_2; 
623 
register int line_size_4 = line_size << 2; 
624  
625 
// handunrolling the loop by 4 gains about 15%

626 
// mininum execution time goes from 74 to 60 cycles

627 
// it's faster than funrollloops, but using

628 
// funrollloops w/ this is bad  74 cycles again.

629 
// all this is on a 7450, tuning for the 7450

630 
#if 0

631 
for (i = 0; i < h; i++) {

632 
pixelsv1 = vec_ld(0, pixels);

633 
pixelsv2 = vec_ld(16, pixels);

634 
vec_st(vec_perm(pixelsv1, pixelsv2, perm),

635 
0, block);

636 
pixels+=line_size;

637 
block +=line_size;

638 
}

639 
#else

640 
for (i = 0; i < h; i += 4) { 
641 
pixelsv1 = vec_ld( 0, pixels);

642 
pixelsv2 = vec_ld(15, pixels);

643 
pixelsv1B = vec_ld(line_size, pixels); 
644 
pixelsv2B = vec_ld(15 + line_size, pixels);

645 
pixelsv1C = vec_ld(line_size_2, pixels); 
646 
pixelsv2C = vec_ld(15 + line_size_2, pixels);

647 
pixelsv1D = vec_ld(line_size_3, pixels); 
648 
pixelsv2D = vec_ld(15 + line_size_3, pixels);

649 
vec_st(vec_perm(pixelsv1, pixelsv2, perm), 
650 
0, (unsigned char*)block); 
651 
vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), 
652 
line_size, (unsigned char*)block); 
653 
vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), 
654 
line_size_2, (unsigned char*)block); 
655 
vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), 
656 
line_size_3, (unsigned char*)block); 
657 
pixels+=line_size_4; 
658 
block +=line_size_4; 
659 
} 
660 
#endif

661 
} 
662  
663 
/* next one assumes that ((line_size % 16) == 0) */

664 
/* Per-byte rounded-up average of four packed 8-bit values:
   a = (a + b + 1) >> 1 in every byte lane, computed without carries
   crossing lanes as (a | b) - (((a ^ b) & 0xFEFEFEFE) >> 1).
   Note that a is both read and assigned, and both arguments are evaluated
   more than once. */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
665 
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
666 
{ 
667 
register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 
668 
register vector unsigned char perm = vec_lvsl(0, pixels); 
669 
int i;

670  
671 
for (i = 0; i < h; i++) { 
672 
pixelsv1 = vec_ld( 0, pixels);

673 
pixelsv2 = vec_ld(16,pixels);

674 
blockv = vec_ld(0, block);

675 
pixelsv = vec_perm(pixelsv1, pixelsv2, perm); 
676 
blockv = vec_avg(blockv,pixelsv); 
677 
vec_st(blockv, 0, (unsigned char*)block); 
678 
pixels+=line_size; 
679 
block +=line_size; 
680 
} 
681 
} 
682  
683 
/* next one assumes that ((line_size % 8) == 0) */

684 
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
685 
{ 
686 
register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; 
687 
int i;

688  
689 
for (i = 0; i < h; i++) { 
690 
/* block is 8 bytesaligned, so we're either in the

691 
left block (16 bytesaligned) or in the right block (not) */

692 
int rightside = ((unsigned long)block & 0x0000000F); 
693  
694 
blockv = vec_ld(0, block);

695 
pixelsv1 = vec_ld( 0, pixels);

696 
pixelsv2 = vec_ld(16, pixels);

697 
pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

698  
699 
if (rightside) {

700 
pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); 
701 
} else {

702 
pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); 
703 
} 
704  
705 
blockv = vec_avg(blockv, pixelsv); 
706  
707 
vec_st(blockv, 0, block);

708  
709 
pixels += line_size; 
710 
block += line_size; 
711 
} 
712 
} 
713  
714 
/* next one assumes that ((line_size % 8) == 0) */

715 
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
716 
{ 
717 
register int i; 
718 
register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 
719 
register vector unsigned char blockv, temp1, temp2; 
720 
register vector unsigned short pixelssum1, pixelssum2, temp3; 
721 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
722 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
723  
724 
temp1 = vec_ld(0, pixels);

725 
temp2 = vec_ld(16, pixels);

726 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

727 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
728 
pixelsv2 = temp2; 
729 
} else {

730 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

731 
} 
732 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
733 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
734 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
735 
(vector unsigned short)pixelsv2); 
736 
pixelssum1 = vec_add(pixelssum1, vctwo); 
737  
738 
for (i = 0; i < h ; i++) { 
739 
int rightside = ((unsigned long)block & 0x0000000F); 
740 
blockv = vec_ld(0, block);

741  
742 
temp1 = vec_ld(line_size, pixels); 
743 
temp2 = vec_ld(line_size + 16, pixels);

744 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
745 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 
746 
pixelsv2 = temp2; 
747 
} else {

748 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

749 
} 
750  
751 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
752 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
753 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
754 
(vector unsigned short)pixelsv2); 
755 
temp3 = vec_add(pixelssum1, pixelssum2); 
756 
temp3 = vec_sra(temp3, vctwo); 
757 
pixelssum1 = vec_add(pixelssum2, vctwo); 
758 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
759  
760 
if (rightside) {

761 
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
762 
} else {

763 
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
764 
} 
765  
766 
vec_st(blockv, 0, block);

767  
768 
block += line_size; 
769 
pixels += line_size; 
770 
} 
771 
} 
772  
773 
/* next one assumes that ((line_size % 8) == 0) */

774 
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
775 
{ 
776 
register int i; 
777 
register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 
778 
register vector unsigned char blockv, temp1, temp2; 
779 
register vector unsigned short pixelssum1, pixelssum2, temp3; 
780 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
781 
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 
782 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
783  
784 
temp1 = vec_ld(0, pixels);

785 
temp2 = vec_ld(16, pixels);

786 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

787 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
788 
pixelsv2 = temp2; 
789 
} else {

790 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

791 
} 
792 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
793 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
794 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
795 
(vector unsigned short)pixelsv2); 
796 
pixelssum1 = vec_add(pixelssum1, vcone); 
797  
798 
for (i = 0; i < h ; i++) { 
799 
int rightside = ((unsigned long)block & 0x0000000F); 
800 
blockv = vec_ld(0, block);

801  
802 
temp1 = vec_ld(line_size, pixels); 
803 
temp2 = vec_ld(line_size + 16, pixels);

804 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
805 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 
806 
pixelsv2 = temp2; 
807 
} else {

808 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

809 
} 
810  
811 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
812 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
813 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
814 
(vector unsigned short)pixelsv2); 
815 
temp3 = vec_add(pixelssum1, pixelssum2); 
816 
temp3 = vec_sra(temp3, vctwo); 
817 
pixelssum1 = vec_add(pixelssum2, vcone); 
818 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
819  
820 
if (rightside) {

821 
blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
822 
} else {

823 
blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
824 
} 
825  
826 
vec_st(blockv, 0, block);

827  
828 
block += line_size; 
829 
pixels += line_size; 
830 
} 
831 
} 
832  
833 
/* next one assumes that ((line_size % 16) == 0) */

834 
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
835 
{ 
836 
register int i; 
837 
register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
838 
register vector unsigned char blockv, temp1, temp2; 
839 
register vector unsigned short temp3, temp4, 
840 
pixelssum1, pixelssum2, pixelssum3, pixelssum4; 
841 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
842 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
843  
844 
temp1 = vec_ld(0, pixels);

845 
temp2 = vec_ld(16, pixels);

846 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

847 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
848 
pixelsv2 = temp2; 
849 
} else {

850 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

851 
} 
852 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
853 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
854 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
855 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
856 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
857 
(vector unsigned short)pixelsv4); 
858 
pixelssum3 = vec_add(pixelssum3, vctwo); 
859 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
860 
(vector unsigned short)pixelsv2); 
861 
pixelssum1 = vec_add(pixelssum1, vctwo); 
862  
863 
for (i = 0; i < h ; i++) { 
864 
blockv = vec_ld(0, block);

865  
866 
temp1 = vec_ld(line_size, pixels); 
867 
temp2 = vec_ld(line_size + 16, pixels);

868 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
869 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 
870 
pixelsv2 = temp2; 
871 
} else {

872 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

873 
} 
874  
875 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
876 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
877 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
878 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
879  
880 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
881 
(vector unsigned short)pixelsv4); 
882 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
883 
(vector unsigned short)pixelsv2); 
884 
temp4 = vec_add(pixelssum3, pixelssum4); 
885 
temp4 = vec_sra(temp4, vctwo); 
886 
temp3 = vec_add(pixelssum1, pixelssum2); 
887 
temp3 = vec_sra(temp3, vctwo); 
888  
889 
pixelssum3 = vec_add(pixelssum4, vctwo); 
890 
pixelssum1 = vec_add(pixelssum2, vctwo); 
891  
892 
blockv = vec_packsu(temp3, temp4); 
893  
894 
vec_st(blockv, 0, block);

895  
896 
block += line_size; 
897 
pixels += line_size; 
898 
} 
899 
} 
900  
901 
/* next one assumes that ((line_size % 16) == 0) */

902 
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) 
903 
{ 
904 
register int i; 
905 
register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4; 
906 
register vector unsigned char blockv, temp1, temp2; 
907 
register vector unsigned short temp3, temp4, 
908 
pixelssum1, pixelssum2, pixelssum3, pixelssum4; 
909 
register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); 
910 
register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); 
911 
register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); 
912  
913 
temp1 = vec_ld(0, pixels);

914 
temp2 = vec_ld(16, pixels);

915 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

916 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
917 
pixelsv2 = temp2; 
918 
} else {

919 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

920 
} 
921 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
922 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
923 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
924 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
925 
pixelssum3 = vec_add((vector unsigned short)pixelsv3, 
926 
(vector unsigned short)pixelsv4); 
927 
pixelssum3 = vec_add(pixelssum3, vcone); 
928 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
929 
(vector unsigned short)pixelsv2); 
930 
pixelssum1 = vec_add(pixelssum1, vcone); 
931  
932 
for (i = 0; i < h ; i++) { 
933 
blockv = vec_ld(0, block);

934  
935 
temp1 = vec_ld(line_size, pixels); 
936 
temp2 = vec_ld(line_size + 16, pixels);

937 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
938 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 
939 
pixelsv2 = temp2; 
940 
} else {

941 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

942 
} 
943  
944 
pixelsv3 = vec_mergel(vczero, pixelsv1); 
945 
pixelsv4 = vec_mergel(vczero, pixelsv2); 
946 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
947 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
948  
949 
pixelssum4 = vec_add((vector unsigned short)pixelsv3, 
950 
(vector unsigned short)pixelsv4); 
951 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
952 
(vector unsigned short)pixelsv2); 
953 
temp4 = vec_add(pixelssum3, pixelssum4); 
954 
temp4 = vec_sra(temp4, vctwo); 
955 
temp3 = vec_add(pixelssum1, pixelssum2); 
956 
temp3 = vec_sra(temp3, vctwo); 
957  
958 
pixelssum3 = vec_add(pixelssum4, vcone); 
959 
pixelssum1 = vec_add(pixelssum2, vcone); 
960  
961 
blockv = vec_packsu(temp3, temp4); 
962  
963 
vec_st(blockv, 0, block);

964  
965 
block += line_size; 
966 
pixels += line_size; 
967 
} 
968 
} 
969  
970 
static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ 
971 
int sum;

972 
register const vector unsigned char vzero = 
973 
(const vector unsigned char)vec_splat_u8(0); 
974 
register vector signed short temp0, temp1, temp2, temp3, temp4, 
975 
temp5, temp6, temp7; 
976 
{ 
977 
register const vector signed short vprod1 =(const vector signed short) 
978 
{ 1,1, 1,1, 1,1, 1,1 }; 
979 
register const vector signed short vprod2 =(const vector signed short) 
980 
{ 1, 1,1,1, 1, 1,1,1 }; 
981 
register const vector signed short vprod3 =(const vector signed short) 
982 
{ 1, 1, 1, 1,1,1,1,1 }; 
983 
register const vector unsigned char perm1 = (const vector unsigned char) 
984 
{0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 
985 
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; 
986 
register const vector unsigned char perm2 = (const vector unsigned char) 
987 
{0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 
988 
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; 
989 
register const vector unsigned char perm3 = (const vector unsigned char) 
990 
{0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 
991 
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}; 
992  
993 
#define ONEITERBUTTERFLY(i, res) \

994 
{ \ 
995 
register vector unsigned char src1, src2, srcO; \ 
996 
register vector unsigned char dst1, dst2, dstO; \ 
997 
register vector signed short srcV, dstV; \ 
998 
register vector signed short but0, but1, but2, op1, op2, op3; \ 
999 
src1 = vec_ld(stride * i, src); \ 
1000 
src2 = vec_ld((stride * i) + 15, src); \

1001 
srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ 
1002 
dst1 = vec_ld(stride * i, dst); \ 
1003 
dst2 = vec_ld((stride * i) + 15, dst); \

1004 
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ 
1005 
/* promote the unsigned chars to signed shorts */ \

1006 
/* we're in the 8x8 function, we only care for the first 8 */ \

1007 
srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \ 
1008 
(vector signed char)srcO); \ 
1009 
dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \ 
1010 
(vector signed char)dstO); \ 
1011 
/* subtractions inside the first butterfly */ \

1012 
but0 = vec_sub(srcV, dstV); \ 
1013 
op1 = vec_perm(but0, but0, perm1); \ 
1014 
but1 = vec_mladd(but0, vprod1, op1); \ 
1015 
op2 = vec_perm(but1, but1, perm2); \ 
1016 
but2 = vec_mladd(but1, vprod2, op2); \ 
1017 
op3 = vec_perm(but2, but2, perm3); \ 
1018 
res = vec_mladd(but2, vprod3, op3); \ 
1019 
} 
1020 
ONEITERBUTTERFLY(0, temp0);

1021 
ONEITERBUTTERFLY(1, temp1);

1022 
ONEITERBUTTERFLY(2, temp2);

1023 
ONEITERBUTTERFLY(3, temp3);

1024 
ONEITERBUTTERFLY(4, temp4);

1025 
ONEITERBUTTERFLY(5, temp5);

1026 
ONEITERBUTTERFLY(6, temp6);

1027 
ONEITERBUTTERFLY(7, temp7);

1028 
} 
1029 
#undef ONEITERBUTTERFLY

1030 
{ 
1031 
register vector signed int vsum; 
1032 
register vector signed short line0 = vec_add(temp0, temp1); 
1033 
register vector signed short line1 = vec_sub(temp0, temp1); 
1034 
register vector signed short line2 = vec_add(temp2, temp3); 
1035 
register vector signed short line3 = vec_sub(temp2, temp3); 
1036 
register vector signed short line4 = vec_add(temp4, temp5); 
1037 
register vector signed short line5 = vec_sub(temp4, temp5); 
1038 
register vector signed short line6 = vec_add(temp6, temp7); 
1039 
register vector signed short line7 = vec_sub(temp6, temp7); 
1040  
1041 
register vector signed short line0B = vec_add(line0, line2); 
1042 
register vector signed short line2B = vec_sub(line0, line2); 
1043 
register vector signed short line1B = vec_add(line1, line3); 
1044 
register vector signed short line3B = vec_sub(line1, line3); 
1045 
register vector signed short line4B = vec_add(line4, line6); 
1046 
register vector signed short line6B = vec_sub(line4, line6); 
1047 
register vector signed short line5B = vec_add(line5, line7); 
1048 
register vector signed short line7B = vec_sub(line5, line7); 
1049  
1050 
register vector signed short line0C = vec_add(line0B, line4B); 
1051 
register vector signed short line4C = vec_sub(line0B, line4B); 
1052 
register vector signed short line1C = vec_add(line1B, line5B); 
1053 
register vector signed short line5C = vec_sub(line1B, line5B); 
1054 
register vector signed short line2C = vec_add(line2B, line6B); 
1055 
register vector signed short line6C = vec_sub(line2B, line6B); 
1056 
register vector signed short line3C = vec_add(line3B, line7B); 
1057 
register vector signed short line7C = vec_sub(line3B, line7B); 
1058  
1059 
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));

1060 
vsum = vec_sum4s(vec_abs(line1C), vsum); 
1061 
vsum = vec_sum4s(vec_abs(line2C), vsum); 
1062 
vsum = vec_sum4s(vec_abs(line3C), vsum); 
1063 
vsum = vec_sum4s(vec_abs(line4C), vsum); 
1064 
vsum = vec_sum4s(vec_abs(line5C), vsum); 
1065 
vsum = vec_sum4s(vec_abs(line6C), vsum); 
1066 
vsum = vec_sum4s(vec_abs(line7C), vsum); 
1067 
vsum = vec_sums(vsum, (vector signed int)vzero); 
1068 
vsum = vec_splat(vsum, 3);

1069 
vec_ste(vsum, 0, &sum);

1070 
} 
1071 
return sum;

1072 
} 
1073  
1074 
/*
  16x8 works with 16 elements; this avoids replicating loads and gives the
  compiler more room for scheduling. It is only used from inside
  hadamard8_diff16_altivec.

  Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has a
  LOT of spill code; it seems gcc (unlike xlc) cannot keep everything in
  registers by itself. The following code includes hand-made register
  allocation. It's not clean, but on a 7450 the resulting code is much faster
  (best case falls from 700+ cycles to 550).

  xlc doesn't add spill code, but it doesn't know how to schedule for the
  7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25%
  fewer instructions...)

  On the 970, the hand-made RA is still a win (around 690 vs. around 780), but
  xlc goes to around 660 on the regular C code...
*/

1092  
1093 
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { 
1094 
int sum;

1095 
register vector signed short 
1096 
temp0 __asm__ ("v0"),

1097 
temp1 __asm__ ("v1"),

1098 
temp2 __asm__ ("v2"),

1099 
temp3 __asm__ ("v3"),

1100 
temp4 __asm__ ("v4"),

1101 
temp5 __asm__ ("v5"),

1102 
temp6 __asm__ ("v6"),

1103 
temp7 __asm__ ("v7");

1104 
register vector signed short 
1105 
temp0S __asm__ ("v8"),

1106 
temp1S __asm__ ("v9"),

1107 
temp2S __asm__ ("v10"),

1108 
temp3S __asm__ ("v11"),

1109 
temp4S __asm__ ("v12"),

1110 
temp5S __asm__ ("v13"),

1111 
temp6S __asm__ ("v14"),

1112 
temp7S __asm__ ("v15");

1113 
register const vector unsigned char vzero __asm__ ("v31") = 
1114 
(const vector unsigned char)vec_splat_u8(0); 
1115 
{ 
1116 
register const vector signed short vprod1 __asm__ ("v16") = 
1117 
(const vector signed short){ 1,1, 1,1, 1,1, 1,1 }; 
1118 
register const vector signed short vprod2 __asm__ ("v17") = 
1119 
(const vector signed short){ 1, 1,1,1, 1, 1,1,1 }; 
1120 
register const vector signed short vprod3 __asm__ ("v18") = 
1121 
(const vector signed short){ 1, 1, 1, 1,1,1,1,1 }; 
1122 
register const vector unsigned char perm1 __asm__ ("v19") = 
1123 
(const vector unsigned char) 
1124 
{0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 
1125 
0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D}; 
1126 
register const vector unsigned char perm2 __asm__ ("v20") = 
1127 
(const vector unsigned char) 
1128 
{0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 
1129 
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B}; 
1130 
register const vector unsigned char perm3 __asm__ ("v21") = 
1131 
(const vector unsigned char) 
1132 
{0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 
1133 
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07}; 
1134  
1135 
#define ONEITERBUTTERFLY(i, res1, res2) \

1136 
{ \ 
1137 
register vector unsigned char src1 __asm__ ("v22"), \ 
1138 
src2 __asm__ ("v23"), \

1139 
dst1 __asm__ ("v24"), \

1140 
dst2 __asm__ ("v25"), \

1141 
srcO __asm__ ("v22"), \

1142 
dstO __asm__ ("v23"); \

1143 
\ 
1144 
register vector signed short srcV __asm__ ("v24"), \ 
1145 
dstV __asm__ ("v25"), \

1146 
srcW __asm__ ("v26"), \

1147 
dstW __asm__ ("v27"), \

1148 
but0 __asm__ ("v28"), \

1149 
but0S __asm__ ("v29"), \

1150 
op1 __asm__ ("v30"), \

1151 
but1 __asm__ ("v22"), \

1152 
op1S __asm__ ("v23"), \

1153 
but1S __asm__ ("v24"), \

1154 
op2 __asm__ ("v25"), \

1155 
but2 __asm__ ("v26"), \

1156 
op2S __asm__ ("v27"), \

1157 
but2S __asm__ ("v28"), \

1158 
op3 __asm__ ("v29"), \

1159 
op3S __asm__ ("v30"); \

1160 
\ 
1161 
src1 = vec_ld(stride * i, src); \ 
1162 
src2 = vec_ld((stride * i) + 16, src); \

1163 
srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ 
1164 
dst1 = vec_ld(stride * i, dst); \ 
1165 
dst2 = vec_ld((stride * i) + 16, dst); \

1166 
dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ 
1167 
/* promote the unsigned chars to signed shorts */ \

1168 
srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \ 
1169 
(vector signed char)srcO); \ 
1170 
dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \ 
1171 
(vector signed char)dstO); \ 
1172 
srcW = (vector signed short)vec_mergel((vector signed char)vzero, \ 
1173 
(vector signed char)srcO); \ 
1174 
dstW = (vector signed short)vec_mergel((vector signed char)vzero, \ 
1175 
(vector signed char)dstO); \ 
1176 
/* subtractions inside the first butterfly */ \

1177 
but0 = vec_sub(srcV, dstV); \ 
1178 
but0S = vec_sub(srcW, dstW); \ 
1179 
op1 = vec_perm(but0, but0, perm1); \ 
1180 
but1 = vec_mladd(but0, vprod1, op1); \ 
1181 
op1S = vec_perm(but0S, but0S, perm1); \ 
1182 
but1S = vec_mladd(but0S, vprod1, op1S); \ 
1183 
op2 = vec_perm(but1, but1, perm2); \ 
1184 
but2 = vec_mladd(but1, vprod2, op2); \ 
1185 
op2S = vec_perm(but1S, but1S, perm2); \ 
1186 
but2S = vec_mladd(but1S, vprod2, op2S); \ 
1187 
op3 = vec_perm(but2, but2, perm3); \ 
1188 
res1 = vec_mladd(but2, vprod3, op3); \ 
1189 
op3S = vec_perm(but2S, but2S, perm3); \ 
1190 
res2 = vec_mladd(but2S, vprod3, op3S); \ 
1191 
} 
1192 
ONEITERBUTTERFLY(0, temp0, temp0S);

1193 
ONEITERBUTTERFLY(1, temp1, temp1S);

1194 
ONEITERBUTTERFLY(2, temp2, temp2S);

1195 
ONEITERBUTTERFLY(3, temp3, temp3S);

1196 
ONEITERBUTTERFLY(4, temp4, temp4S);

1197 
ONEITERBUTTERFLY(5, temp5, temp5S);

1198 
ONEITERBUTTERFLY(6, temp6, temp6S);

1199 
ONEITERBUTTERFLY(7, temp7, temp7S);

1200 
} 
1201 
#undef ONEITERBUTTERFLY

1202 
{ 
1203 
register vector signed int vsum; 
1204 
register vector signed short line0S, line1S, line2S, line3S, line4S, 
1205 
line5S, line6S, line7S, line0BS,line2BS, 
1206 
line1BS,line3BS,line4BS,line6BS,line5BS, 
1207 
line7BS,line0CS,line4CS,line1CS,line5CS, 
1208 
line2CS,line6CS,line3CS,line7CS; 
1209  
1210 
register vector signed short line0 = vec_add(temp0, temp1); 
1211 
register vector signed short line1 = vec_sub(temp0, temp1); 
1212 
register vector signed short line2 = vec_add(temp2, temp3); 
1213 
register vector signed short line3 = vec_sub(temp2, temp3); 
1214 
register vector signed short line4 = vec_add(temp4, temp5); 
1215 
register vector signed short line5 = vec_sub(temp4, temp5); 
1216 
register vector signed short line6 = vec_add(temp6, temp7); 
1217 
register vector signed short line7 = vec_sub(temp6, temp7); 
1218  
1219 
register vector signed short line0B = vec_add(line0, line2); 
1220 
register vector signed short line2B = vec_sub(line0, line2); 
1221 
register vector signed short line1B = vec_add(line1, line3); 
1222 
register vector signed short line3B = vec_sub(line1, line3); 
1223 
register vector signed short line4B = vec_add(line4, line6); 
1224 
register vector signed short line6B = vec_sub(line4, line6); 
1225 
register vector signed short line5B = vec_add(line5, line7); 
1226 
register vector signed short line7B = vec_sub(line5, line7); 
1227  
1228 
register vector signed short line0C = vec_add(line0B, line4B); 
1229 
register vector signed short line4C = vec_sub(line0B, line4B); 
1230 
register vector signed short line1C = vec_add(line1B, line5B); 
1231 
register vector signed short line5C = vec_sub(line1B, line5B); 
1232 
register vector signed short line2C = vec_add(line2B, line6B); 
1233 
register vector signed short line6C = vec_sub(line2B, line6B); 
1234 
register vector signed short line3C = vec_add(line3B, line7B); 
1235 
register vector signed short line7C = vec_sub(line3B, line7B); 
1236  
1237 
vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));

1238 
vsum = vec_sum4s(vec_abs(line1C), vsum); 
1239 
vsum = vec_sum4s(vec_abs(line2C), vsum); 
1240 
vsum = vec_sum4s(vec_abs(line3C), vsum); 
1241 
vsum = vec_sum4s(vec_abs(line4C), vsum); 
1242 
vsum = vec_sum4s(vec_abs(line5C), vsum); 
1243 
vsum = vec_sum4s(vec_abs(line6C), vsum); 
1244 
vsum = vec_sum4s(vec_abs(line7C), vsum); 
1245  
1246 
line0S = vec_add(temp0S, temp1S); 
1247 
line1S = vec_sub(temp0S, temp1S); 
1248 
line2S = vec_add(temp2S, temp3S); 
1249 
line3S = vec_sub(temp2S, temp3S); 
1250 
line4S = vec_add(temp4S, temp5S); 
1251 
line5S = vec_sub(temp4S, temp5S); 
1252 
line6S = vec_add(temp6S, temp7S); 
1253 
line7S = vec_sub(temp6S, temp7S); 
1254  
1255 
line0BS = vec_add(line0S, line2S); 
1256 
line2BS = vec_sub(line0S, line2S); 
1257 
line1BS = vec_add(line1S, line3S); 
1258 
line3BS = vec_sub(line1S, line3S); 
1259 
line4BS = vec_add(line4S, line6S); 
1260 
line6BS = vec_sub(line4S, line6S); 
1261 
line5BS = vec_add(line5S, line7S); 
1262 
line7BS = vec_sub(line5S, line7S); 
1263  
1264 
line0CS = vec_add(line0BS, line4BS); 
1265 
line4CS = vec_sub(line0BS, line4BS); 
1266 
line1CS = vec_add(line1BS, line5BS); 
1267 
line5CS = vec_sub(line1BS, line5BS); 
1268 
line2CS = vec_add(line2BS, line6BS); 
1269 
line6CS = vec_sub(line2BS, line6BS); 
1270 
line3CS = vec_add(line3BS, line7BS); 
1271 
line7CS = vec_sub(line3BS, line7BS); 
1272  
1273 
vsum = vec_sum4s(vec_abs(line0CS), vsum); 
1274 
vsum = vec_sum4s(vec_abs(line1CS), vsum); 
1275 
vsum = vec_sum4s(vec_abs(line2CS), vsum); 
1276 
vsum = vec_sum4s(vec_abs(line3CS), vsum); 
1277 
vsum = vec_sum4s(vec_abs(line4CS), vsum); 
1278 
vsum = vec_sum4s(vec_abs(line5CS), vsum); 
1279 
vsum = vec_sum4s(vec_abs(line6CS), vsum); 
1280 
vsum = vec_sum4s(vec_abs(line7CS), vsum); 
1281 
vsum = vec_sums(vsum, (vector signed int)vzero); 
1282 
vsum = vec_splat(vsum, 3);

1283 
vec_ste(vsum, 0, &sum);

1284 
} 
1285 
return sum;

1286 
} 
1287  
1288 
/**
 * Hadamard score of a 16-pixel-wide area, built from 16x8 strips.
 *
 * The top 16x8 strip is always scored. Only when h equals 16 is the strip
 * starting 8 rows further down scored as well and added to the result.
 *
 * @param s      passed through unchanged to hadamard8_diff16x8_altivec
 * @param dst    first area
 * @param src    second area
 * @param stride byte distance between consecutive rows
 * @param h      area height; only the value 16 enables the second strip
 * @return       sum of the strip scores
 */
static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    const int top = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);

    if (h != 16) {
        return top;
    }

    /* Second strip begins eight rows below the first. */
    return top + hadamard8_diff16x8_altivec(s, dst + 8 * stride,
                                            src + 8 * stride, stride, 8);
}
1298  
1299 
static void vorbis_inverse_coupling_altivec(float *mag, float *ang, 
1300 
int blocksize)

1301 
{ 
1302 
int i;

1303 
vector float m, a;

1304 
vector bool int t0, t1; 
1305 
const vector unsigned int v_31 = //XXX 
1306 
vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1)); 
1307 
for (i = 0; i < blocksize; i += 4) { 
1308 
m = vec_ld(0, mag+i);

1309 
a = vec_ld(0, ang+i);

1310 
t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); 
1311 
t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); 
1312 
a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31)); 
1313 
t0 = (vector bool int)vec_and(a, t1); 
1314 
t1 = (vector bool int)vec_andc(a, t1); 
1315 
a = vec_sub(m, (vector float)t1);

1316 
m = vec_add(m, (vector float)t0);

1317 
vec_stl(a, 0, ang+i);

1318 
vec_stl(m, 0, mag+i);

1319 
} 
1320 
} 
1321  
1322 
/* next one assumes that ((line_size % 8) == 0) */

1323 
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) 
1324 
{ 
1325 
register int i; 
1326 
register vector unsigned char pixelsv1, pixelsv2, pixelsavg; 
1327 
register vector unsigned char blockv, temp1, temp2, blocktemp; 
1328 
register vector unsigned short pixelssum1, pixelssum2, temp3; 
1329  
1330 
register const vector unsigned char vczero = (const vector unsigned char) 
1331 
vec_splat_u8(0);

1332 
register const vector unsigned short vctwo = (const vector unsigned short) 
1333 
vec_splat_u16(2);

1334  
1335 
temp1 = vec_ld(0, pixels);

1336 
temp2 = vec_ld(16, pixels);

1337 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));

1338 
if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { 
1339 
pixelsv2 = temp2; 
1340 
} else {

1341 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));

1342 
} 
1343 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1344 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1345 
pixelssum1 = vec_add((vector unsigned short)pixelsv1, 
1346 
(vector unsigned short)pixelsv2); 
1347 
pixelssum1 = vec_add(pixelssum1, vctwo); 
1348  
1349 
for (i = 0; i < h ; i++) { 
1350 
int rightside = ((unsigned long)block & 0x0000000F); 
1351 
blockv = vec_ld(0, block);

1352  
1353 
temp1 = vec_ld(line_size, pixels); 
1354 
temp2 = vec_ld(line_size + 16, pixels);

1355 
pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); 
1356 
if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) { 
1357 
pixelsv2 = temp2; 
1358 
} else {

1359 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));

1360 
} 
1361  
1362 
pixelsv1 = vec_mergeh(vczero, pixelsv1); 
1363 
pixelsv2 = vec_mergeh(vczero, pixelsv2); 
1364 
pixelssum2 = vec_add((vector unsigned short)pixelsv1, 
1365 
(vector unsigned short)pixelsv2); 
1366 
temp3 = vec_add(pixelssum1, pixelssum2); 
1367 
temp3 = vec_sra(temp3, vctwo); 
1368 
pixelssum1 = vec_add(pixelssum2, vctwo); 
1369 
pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); 
1370  
1371 
if (rightside) {

1372 
blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); 
1373 
} else {

1374 
blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); 
1375 
} 
1376  
1377 
blockv = vec_avg(blocktemp, blockv); 
1378 
vec_st(blockv, 0, block);

1379  
1380 
block += line_size; 
1381 
pixels += line_size; 
1382 
} 
1383 
} 
1384  
1385 
/**
 * Install the AltiVec implementations into a DSPContext.
 *
 * Only the function pointers assigned below are touched; every other entry of
 * the context keeps whatever value it had on entry.
 *
 * @param c     context whose function-pointer tables are overwritten
 * @param avctx unused in this function
 */
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    /* motion-estimation comparison functions */
    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0]= sad16_altivec;
    c->sad[1]= sad8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1]= sse8_altivec;
    c->sse[0]= sse16_altivec;
    c->pix_sum = pix_sum_altivec;

    /* block helpers */
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels = get_pixels_altivec;
    c->clear_block = clear_block_altivec;
    c->add_bytes= add_bytes_altivec;

    /* pixel copy / half-pel interpolation */
    c->put_pixels_tab[0][0] = put_pixels16_altivec;
    /* the two functions do the same thing, so use the same code */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
    c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
    c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
    if (CONFIG_VORBIS_DECODER)
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
}