/*
  marc.hoffman@analog.com    March 8, 2004

  Altivec Acceleration for Color Space Conversion revision 0.2

  convert I420 YV12 to RGB in various formats,
    it rejects images that are not in 420 formats
    it rejects images that don't have widths of multiples of 16
    it rejects images that don't have heights of multiples of 2
  reject defers to C simulation codes.

  lots of optimizations to be done here

  1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
     so we currently use max min to clip

  2. the inefficient use of chroma loading needs a bit of brushing up

  3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls


  MODIFIED to calculate coeffs from currently selected color space.
  MODIFIED core to be a macro which you spec the output format.
  ADDED UYVY conversion which is never called due to some thing in SWSCALE.
  CORRECTED algorithm selection to be strict on input formats.
  ADDED runtime detection of altivec.

  ADDED altivec_yuv2packedX vertical scl + RGB converter

  March 27,2004
  PERFORMANCE ANALYSIS

  The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
  The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence

  720*480*30  ~10MPS

  so we have roughly 10clocks per pixel this is too high something has to be wrong.

  OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.

  OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
  guaranteed to have the input video frame it was just decompressed so
  it probably resides in L1 caches.  However we are creating the
  output video stream this needs to use the DSTST instruction to
  optimize for the cache.  We couple this with the fact that we are
  not going to be visiting the input buffer again so we mark it Least
  Recently Used.  This shaves 25% of the processor cycles off.

  Now MEMCPY is the largest mips consumer in the system, probably due
  to the inefficient X11 stuff.

  GL libraries seem to be very slow on this machine 1.33Ghz PB running
  Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
  a versioning issue, however I have libGL.1.2.dylib for both
  machines. ((We need to figure this out now))

  GL2 libraries work now with patch for RGB32

  NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor

  Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.

*/

/* Standard library. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
/* Project headers (SwsContext, SwsFunc, IMGFMT_*, AVV, MSG_WARN are expected
   to come from these; they are not defined in this file). */
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
#include "../mangle.h"
#include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff

/* Make sure the optional build switches are off for this translation unit. */
#undef PROFILE_THE_BEAST
#undef INC_SCALING

/* Short names for raw 8-bit samples. */
typedef unsigned char ubyte;
typedef signed char   sbyte;

84 
/* RGB interleaver, 16 planar pels 8bit samples per channel in

85 
homogeneous vector registers x0,x1,x2 are interleaved with the

86 
following technique:

87 

88 
o0 = vec_mergeh (x0,x1);

89 
o1 = vec_perm (o0, x2, perm_rgb_0);

90 
o2 = vec_perm (o0, x2, perm_rgb_1);

91 
o3 = vec_mergel (x0,x1);

92 
o4 = vec_perm (o3,o2,perm_rgb_2);

93 
o5 = vec_perm (o3,o2,perm_rgb_3);

94 

95 
perm_rgb_0: o0(RG).h v1(B) > o1*

96 
0 1 2 3 4

97 
rgbrgbrgbrgbrgbr

98 
0010 0100 1001 0010

99 
0102 3145 2673 894A

100 

101 
perm_rgb_1: o0(RG).h v1(B) > o2

102 
0 1 2 3 4

103 
gbrgbrgbbbbbbbbb

104 
0100 1001 1111 1111

105 
B5CD 6EF7 89AB CDEF

106 

107 
perm_rgb_2: o3(RG).l o2(rgbB.l) > o4*

108 
0 1 2 3 4

109 
gbrgbrgbrgbrgbrg

110 
1111 1111 0010 0100

111 
89AB CDEF 0182 3945

112 

113 
perm_rgb_2: o3(RG).l o2(rgbB.l) > o5*

114 
0 1 2 3 4

115 
brgbrgbrgbrgbrgb

116 
1001 0010 0100 1001

117 
a67b 89cA BdCD eEFf

118 

119 
*/

120 
static

121 
const vector unsigned char 
122 
perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05, 
123 
0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a), 
124 
perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17, 
125 
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f), 
126 
perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, 
127 
0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05), 
128 
perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a, 
129 
0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f); 
130  
/* Interleave three planar 16-byte vectors into three packed vectors.
   The bytes land in memory in the order x0, x1, x2 (i.e. the THIRD macro
   argument comes first); the results are written to y0, y1, y2.  */
#define vec_merge3(x2,x1,x0,y0,y1,y2)    \
do {					 \
  typeof(x0) o0,o2,o3;			 \
      o0 = vec_mergeh (x0,x1);		 \
      y0 = vec_perm (o0, x2, perm_rgb_0);\
      o2 = vec_perm (o0, x2, perm_rgb_1);\
      o3 = vec_mergel (x0,x1);		 \
      y1 = vec_perm (o3,o2,perm_rgb_2);	 \
      y2 = vec_perm (o3,o2,perm_rgb_3);	 \
} while(0)

/* Interleave x0,x1,x2 with vec_merge3 and store the three resulting vectors
   (48 bytes) at ptr, advancing ptr by three vectors.  ptr must be an lvalue.
   No trailing ';' after while (0): the caller supplies it, so the macro is
   safe inside if/else.  */
#define vec_mstrgb24(x0,x1,x2,ptr)        \
do {					 \
  typeof(x0) _0,_1,_2;			 \
  vec_merge3 (x0,x1,x2,_0,_1,_2);	 \
  vec_st (_0, 0, ptr++);		 \
  vec_st (_1, 0, ptr++);		 \
  vec_st (_2, 0, ptr++);		 \
} while (0)

/* Same as vec_mstrgb24 but passes the channels to vec_merge3 in reverse
   order; stores 48 bytes at ptr and advances ptr by three vectors.
   No trailing ';' after while (0): the caller supplies it.  */
#define vec_mstbgr24(x0,x1,x2,ptr)       \
do {					 \
  typeof(x0) _0,_1,_2;			 \
  vec_merge3 (x2,x1,x0,_0,_1,_2);	 \
  vec_st (_0, 0, ptr++);		 \
  vec_st (_1, 0, ptr++);		 \
  vec_st (_2, 0, ptr++);		 \
} while (0)

/* pack the pixels in rgb0 format
   msb R
   lsb 0

   Interleaves four planar byte vectors x0..x3 (of vector type T) into 16
   four-byte pixels, each laid out x0,x1,x2,x3 in memory, stores the 64 bytes
   at ptr and advances ptr by four vectors.  ptr must be an lvalue.
   No trailing ';' after while (0): the caller supplies it.
*/
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)						       \
do {										       \
  T _0,_1,_2,_3;								       \
  _0 = vec_mergeh (x0,x1);							       \
  _1 = vec_mergeh (x2,x3);					    	  	       \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);            \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);            \
  vec_st (_2, 0*16, (T *)ptr);						               \
  vec_st (_3, 1*16, (T *)ptr);						       	       \
  _0 = vec_mergel (x0,x1);							       \
  _1 = vec_mergel (x2,x3);						       	       \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); 	       \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); 	       \
  vec_st (_2, 2*16, (T *)ptr);						       	       \
  vec_st (_3, 3*16, (T *)ptr);						       	       \
  ptr += 4;									       \
} while (0)

/*

  | 1     0       1.4021 | | Y  |
  | 1    -0.3441 -0.7142 |x| Cb |
  | 1     1.7718  0      | | Cr |


  Y:      [-128 127]
  Cb/Cr : [-128 127]

  typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.

*/

/* Zero-extend the first 8 bytes of x to eight 16-bit lanes: each output
   short is built from byte 0 of an all-zero vector (index 0x10) followed by
   one byte of x (indices 0x00-0x07).  */
#define vec_unh(x) \
  ((vector signed short) \
    vec_perm(x,(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                       0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
/* Zero-extend the last 8 bytes of x (indices 0x08-0x0F) to eight 16-bit
   lanes; counterpart of vec_unh.  */
#define vec_unl(x) \
  ((vector signed short) \
    vec_perm(x,(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                       0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))

/* Clamp every lane of x to the range [16, 235].
   The bounds are broadcast with vec_splat (the idiom used elsewhere in this
   file) so the result does not depend on whether AVV(n) replicates n into
   all lanes or only sets lane 0.
   NOTE(review): AVV is defined outside this file - confirm its semantics.  */
#define vec_clip(x) \
  vec_max (vec_min (x, vec_splat ((typeof(x))AVV(235), 0)), \
           vec_splat ((typeof(x))AVV(16), 0))

/* Clamp x and y with vec_clip, then pack the two 16-bit vectors into one
   vector of 16 unsigned bytes (x in the first half, y in the second).  */
#define vec_packclp_a(x,y) \
  ((vector unsigned char)vec_pack (vec_clip (x), vec_clip (y)))

/* Pack two signed 16-bit vectors into 16 unsigned bytes: negative lanes are
   first raised to 0 with vec_max, then vec_packs narrows the values
   (reinterpreted as unsigned shorts) with saturation.  */
#define vec_packclp(x,y) \
  ((vector unsigned char)vec_packs \
      ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
       (vector unsigned short)vec_max (y,(vector signed short) AVV(0))))

//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)


224 
static inline void cvtyuvtoRGB (SwsContext *c, 
225 
vector signed short Y, vector signed short U, vector signed short V, 
226 
vector signed short *R, vector signed short *G, vector signed short *B) 
227 
{ 
228 
vector signed short vx,ux,uvx; 
229  
230 
Y = vec_mradds (Y, c>CY, c>OY); 
231 
U = vec_sub (U,(vector signed short) 
232 
vec_splat((vector signed short)AVV(128),0)); 
233 
V = vec_sub (V,(vector signed short) 
234 
vec_splat((vector signed short)AVV(128),0)); 
235  
236 
// ux = (CBU*(u<<c>CSHIFT)+0x4000)>>15;

237 
ux = vec_sl (U, c>CSHIFT); 
238 
*B = vec_mradds (ux, c>CBU, Y); 
239  
240 
// vx = (CRV*(v<<c>CSHIFT)+0x4000)>>15;

241 
vx = vec_sl (V, c>CSHIFT); 
242 
*R = vec_mradds (vx, c>CRV, Y); 
243  
244 
// uvx = ((CGU*u) + (CGV*v))>>15;

245 
uvx = vec_mradds (U, c>CGU, Y); 
246 
*G = vec_mradds (V, c>CGV, uvx); 
247 
} 
248  
249  
/*
  ------------------------------------------------------------------------------
  CS converters
  ------------------------------------------------------------------------------
*/

/* DEFCSP420_CVT(name, out_pixels) defines

     static int altivec_<name> (SwsContext *c, unsigned char **in,
                                int *instrides, int srcSliceY, int srcSliceH,
                                unsigned char **oplanes, int *outstrides)

   which converts a slice of planar Y/U/V input (in[0], in[1], in[2]) with
   one chroma row per two luma rows into packed pixels at oplanes[0].
   Two luma rows are processed per iteration, 16 pixels at a time;
   out_pixels(R,G,B,ptr) stores 16 pixels and advances ptr.
   Returns srcSliceH.

   Luma rows are addressed through instrides[0], so a source stride larger
   than the width is handled (the previous code assumed stride == width).

   NOTE(review): luma is loaded with vec_ldl straight from y1i/y2i with no
   realignment; this presumably relies on 16-byte aligned luma rows.
   NOTE(review): the chroma realignment reads uivP[1]/vivP[1], i.e. up to
   16 bytes beyond the 8 chroma samples actually used.  */
#define DEFCSP420_CVT(name,out_pixels)                                     \
static int altivec_##name (SwsContext *c,                                  \
				unsigned char **in, int *instrides,	   \
				int srcSliceY,	int srcSliceH,		   \
				unsigned char **oplanes, int *outstrides)  \
{									   \
  int w = c->srcW;							   \
  int h = srcSliceH;							   \
  int i,j;								   \
  int instrides_scl[3];							   \
  vector unsigned char y0,y1;						   \
									   \
  vector signed char  u,v;						   \
									   \
  vector signed short Y0,Y1,Y2,Y3;					   \
  vector signed short U,V;						   \
  vector signed short vx,ux,uvx;					   \
  vector signed short vx0,ux0,uvx0;					   \
  vector signed short vx1,ux1,uvx1;					   \
  vector signed short R0,G0,B0;						   \
  vector signed short R1,G1,B1;						   \
  vector unsigned char R,G,B;						   \
									   \
  vector unsigned char *uivP, *vivP;			   		   \
  vector unsigned char align_perm;					   \
									   \
  /* keep the conversion coefficients in locals for the whole slice */	   \
  vector signed short 							   \
    lCY  = c->CY,							   \
    lOY  = c->OY,							   \
    lCRV = c->CRV,							   \
    lCBU = c->CBU,							   \
    lCGU = c->CGU,							   \
    lCGV = c->CGV;							   \
									   \
  vector unsigned short lCSHIFT = c->CSHIFT;				   \
									   \
  ubyte *y1i   = in[0];							   \
  ubyte *y2i   = in[0]+instrides[0];	/* second luma row of the pair */  \
  ubyte *ui    = in[1];							   \
  ubyte *vi    = in[2];							   \
									   \
  /* even and odd output rows of the current row pair */		   \
  vector unsigned char *oute						   \
    = (vector unsigned char *)						   \
        (oplanes[0]+srcSliceY*outstrides[0]);				   \
  vector unsigned char *outo						   \
    = (vector unsigned char *)						   \
        (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);		   \
									   \
									   \
  instrides_scl[0] = instrides[0]*2-w; /* the loop moves y{1,2}i by w */   \
  instrides_scl[1] = instrides[1]-w/2; /* the loop moves ui by w/2 */	   \
  instrides_scl[2] = instrides[2]-w/2; /* the loop moves vi by w/2 */	   \
									   \
									   \
  for (i=0;i<h/2;i++) {							   \
    /* DSTST: hint the cache about the output rows about to be stored */   \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 \
									   \
    for (j=0;j<w/16;j++) {						   \
									   \
      /* 16 luma samples from each of the two rows */			   \
      y0 = vec_ldl (0,y1i);						   \
      y1 = vec_ldl (0,y2i);						   \
      uivP = (vector unsigned char *)ui;				   \
      vivP = (vector unsigned char *)vi;				   \
									   \
      /* chroma pointers advance by 8 bytes: realign with lvsl/perm */	   \
      align_perm = vec_lvsl (0, ui);					   \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);	   \
									   \
      align_perm = vec_lvsl (0, vi);					   \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);	   \
									   \
      u = (vector signed char)						   \
     		vec_sub (u,(vector signed char)                            \
				vec_splat((vector signed char)AVV(128),0));\
      v = (vector signed char)						   \
     		vec_sub (v,(vector signed char)				   \
				vec_splat((vector signed char)AVV(128),0));\
									   \
      /* sign-extend the first 8 chroma samples to 16 bits */		   \
      U  = vec_unpackh (u);						   \
      V  = vec_unpackh (v);						   \
									   \
									   \
        Y0 = vec_unh (y0);						   \
        Y1 = vec_unl (y0);						   \
        Y2 = vec_unh (y1);						   \
        Y3 = vec_unl (y1);						   \
									   \
        Y0 = vec_mradds (Y0, lCY, lOY);					   \
        Y1 = vec_mradds (Y1, lCY, lOY);					   \
        Y2 = vec_mradds (Y2, lCY, lOY);					   \
        Y3 = vec_mradds (Y3, lCY, lOY);					   \
									   \
	/*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */			   \
	ux = vec_sl (U, lCSHIFT);					   \
	ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));	   \
	/* duplicate each chroma term for the two pixels sharing it */	   \
	ux0  = vec_mergeh (ux,ux);					   \
	ux1  = vec_mergel (ux,ux);					   \
									   \
	/* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;	*/			   \
	vx = vec_sl (V, lCSHIFT);					   \
	vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));	   \
	vx0  = vec_mergeh (vx,vx);					   \
	vx1  = vec_mergel (vx,vx);					   \
									   \
	/* uvx = ((CGU*u) + (CGV*v))>>15 */				   \
	uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));	   \
	uvx = vec_mradds (V, lCGV, uvx);				   \
	uvx0 = vec_mergeh (uvx,uvx);					   \
	uvx1 = vec_mergel (uvx,uvx);					   \
									   \
	/* first (even) row */						   \
	R0 = vec_add (Y0,vx0);						   \
	G0 = vec_add (Y0,uvx0);						   \
	B0 = vec_add (Y0,ux0);						   \
	R1 = vec_add (Y1,vx1);						   \
	G1 = vec_add (Y1,uvx1);						   \
	B1 = vec_add (Y1,ux1);						   \
									   \
	R  = vec_packclp (R0,R1);					   \
	G  = vec_packclp (G0,G1);					   \
	B  = vec_packclp (B0,B1);					   \
									   \
	out_pixels(R,G,B,oute);						   \
									   \
	/* second (odd) row reuses the same chroma terms */		   \
	R0 = vec_add (Y2,vx0);						   \
	G0 = vec_add (Y2,uvx0);						   \
	B0 = vec_add (Y2,ux0);						   \
	R1 = vec_add (Y3,vx1);						   \
	G1 = vec_add (Y3,uvx1);						   \
	B1 = vec_add (Y3,ux1);						   \
	R  = vec_packclp (R0,R1);					   \
	G  = vec_packclp (G0,G1);					   \
	B  = vec_packclp (B0,B1);					   \
									   \
									   \
	out_pixels(R,G,B,outo);						   \
									   \
      y1i  += 16;							   \
      y2i  += 16;							   \
      ui   += 8;							   \
      vi   += 8;							   \
									   \
    }									   \
									   \
    /* oute/outo are now one row ahead: skip the other row of the pair */  \
    outo += (outstrides[0])>>4;					           \
    oute += (outstrides[0])>>4;					           \
									   \
    ui    += instrides_scl[1];						   \
    vi    += instrides_scl[2];						   \
    y1i   += instrides_scl[0];						   \
    y2i   += instrides_scl[0];						   \
  }									   \
  return srcSliceH;							   \
}


/* Pixel writers usable as the out_pixels argument of DEFCSP420_CVT.
   a,b,c are the packed R,G,B byte vectors; ptr is advanced past the stored
   pixels.  The 32-bit variants fill the fourth channel with zero.

   NOTE(review): out_rgb24(a,b,c) and out_bgr24(a,b,c) both end up calling
   vec_merge3 (a,b,c,...) because out_bgr24 reverses the arguments and
   vec_mstbgr24 reverses them again, so the two 24-bit converters emit the
   same byte order.  One of them is presumably wrong - verify against the
   expected IMGFMT_RGB24 / IMGFMT_BGR24 layouts before changing.  */
#define out_abgr(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(typeof(a),c,b,a,((typeof (a))AVV(0)),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(typeof(a),a,b,c,((typeof (a))AVV(0)),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(c,b,a,ptr)

419 
DEFCSP420_CVT (yuv2_abgr32, out_abgr) 
420 
#if 1 
421 
DEFCSP420_CVT (yuv2_bgra32, out_argb) 
422 
#else

423 
static int altivec_yuv2_bgra32 (SwsContext *c, 
424 
unsigned char **in, int *instrides, 
425 
int srcSliceY, int srcSliceH, 
426 
unsigned char **oplanes, int *outstrides) 
427 
{ 
428 
int w = c>srcW;

429 
int h = srcSliceH;

430 
int i,j;

431 
int instrides_scl[3]; 
432 
vector unsigned char y0,y1; 
433 

434 
vector signed char u,v; 
435 

436 
vector signed short Y0,Y1,Y2,Y3; 
437 
vector signed short U,V; 
438 
vector signed short vx,ux,uvx; 
439 
vector signed short vx0,ux0,uvx0; 
440 
vector signed short vx1,ux1,uvx1; 
441 
vector signed short R0,G0,B0; 
442 
vector signed short R1,G1,B1; 
443 
vector unsigned char R,G,B; 
444 

445 
vector unsigned char *uivP, *vivP; 
446 
vector unsigned char align_perm; 
447 

448 
vector signed short 
449 
lCY = c>CY, 
450 
lOY = c>OY, 
451 
lCRV = c>CRV, 
452 
lCBU = c>CBU, 
453 
lCGU = c>CGU, 
454 
lCGV = c>CGV; 
455 

456 
vector unsigned short lCSHIFT = c>CSHIFT; 
457 

458 
ubyte *y1i = in[0];

459 
ubyte *y2i = in[0]+w;

460 
ubyte *ui = in[1];

461 
ubyte *vi = in[2];

462 

463 
vector unsigned char *oute 
464 
= (vector unsigned char *) 
465 
(oplanes[0]+srcSliceY*outstrides[0]); 
466 
vector unsigned char *outo 
467 
= (vector unsigned char *) 
468 
(oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]); 
469 

470 

471 
instrides_scl[0] = instrides[0]; 
472 
instrides_scl[1] = instrides[1]w/2; /* the loop moves ui by w/2 */ 
473 
instrides_scl[2] = instrides[2]w/2; /* the loop moves vi by w/2 */ 
474 

475 

476 
for (i=0;i<h/2;i++) { 
477 
vec_dstst (outo, (0x02000002(((w*3+32)/32)<<16)), 0); 
478 
vec_dstst (oute, (0x02000002(((w*3+32)/32)<<16)), 1); 
479 

480 
for (j=0;j<w/16;j++) { 
481 

482 
y0 = vec_ldl (0,y1i);

483 
y1 = vec_ldl (0,y2i);

484 
uivP = (vector unsigned char *)ui; 
485 
vivP = (vector unsigned char *)vi; 
486 

487 
align_perm = vec_lvsl (0, ui);

488 
u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm); 
489 

490 
align_perm = vec_lvsl (0, vi);

491 
v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm); 
492 
u = (vector signed char) 
493 
vec_sub (u,(vector signed char) 
494 
vec_splat((vector signed char)AVV(128),0)); 
495 

496 
v = (vector signed char) 
497 
vec_sub (v, (vector signed char) 
498 
vec_splat((vector signed char)AVV(128),0)); 
499 

500 
U = vec_unpackh (u); 
501 
V = vec_unpackh (v); 
502 

503 

504 
Y0 = vec_unh (y0); 
505 
Y1 = vec_unl (y0); 
506 
Y2 = vec_unh (y1); 
507 
Y3 = vec_unl (y1); 
508 

509 
Y0 = vec_mradds (Y0, lCY, lOY); 
510 
Y1 = vec_mradds (Y1, lCY, lOY); 
511 
Y2 = vec_mradds (Y2, lCY, lOY); 
512 
Y3 = vec_mradds (Y3, lCY, lOY); 
513 

514 
/* ux = (CBU*(u<<CSHIFT)+0x4000)>>15 */

515 
ux = vec_sl (U, lCSHIFT); 
516 
ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0)); 
517 
ux0 = vec_mergeh (ux,ux); 
518 
ux1 = vec_mergel (ux,ux); 
519 

520 
/* vx = (CRV*(v<<CSHIFT)+0x4000)>>15; */

521 
vx = vec_sl (V, lCSHIFT); 
522 
vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0)); 
523 
vx0 = vec_mergeh (vx,vx); 
524 
vx1 = vec_mergel (vx,vx); 
525 
/* uvx = ((CGU*u) + (CGV*v))>>15 */

526 
uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0)); 
527 
uvx = vec_mradds (V, lCGV, uvx); 
528 
uvx0 = vec_mergeh (uvx,uvx); 
529 
uvx1 = vec_mergel (uvx,uvx); 
530 
R0 = vec_add (Y0,vx0); 
531 
G0 = vec_add (Y0,uvx0); 
532 
B0 = vec_add (Y0,ux0); 
533 
R1 = vec_add (Y1,vx1); 
534 
G1 = vec_add (Y1,uvx1); 
535 
B1 = vec_add (Y1,ux1); 
536 
R = vec_packclp (R0,R1); 
537 
G = vec_packclp (G0,G1); 
538 
B = vec_packclp (B0,B1); 
539 

540 
out_argb(R,G,B,oute); 
541 
R0 = vec_add (Y2,vx0); 
542 
G0 = vec_add (Y2,uvx0); 
543 
B0 = vec_add (Y2,ux0); 
544 
R1 = vec_add (Y3,vx1); 
545 
G1 = vec_add (Y3,uvx1); 
546 
B1 = vec_add (Y3,ux1); 
547 
R = vec_packclp (R0,R1); 
548 
G = vec_packclp (G0,G1); 
549 
B = vec_packclp (B0,B1); 
550 

551 
out_argb(R,G,B,outo); 
552 
y1i += 16;

553 
y2i += 16;

554 
ui += 8;

555 
vi += 8;

556 

557 
} 
558 

559 
outo += (outstrides[0])>>4; 
560 
oute += (outstrides[0])>>4; 
561 

562 
ui += instrides_scl[1];

563 
vi += instrides_scl[2];

564 
y1i += instrides_scl[0];

565 
y2i += instrides_scl[0];

566 
} 
567 
return srcSliceH;

568 
} 
569  
570 
#endif

571  
572  
573 
DEFCSP420_CVT (yuv2_rgba32, out_rgba) 
574 
DEFCSP420_CVT (yuv2_argb32, out_argb) 
575 
DEFCSP420_CVT (yuv2_rgb24, out_rgb24) 
576 
DEFCSP420_CVT (yuv2_bgr24, out_bgr24) 
577  
578  
579 
// uyvyuyvyuyvyuyvy

580 
// 0123 4567 89ab cdef

581 
static

582 
const vector unsigned char 
583 
demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00, 
584 
0x10,0x04,0x10,0x04, 
585 
0x10,0x08,0x10,0x08, 
586 
0x10,0x0c,0x10,0x0c), 
587 
demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02, 
588 
0x10,0x06,0x10,0x06, 
589 
0x10,0x0A,0x10,0x0A, 
590 
0x10,0x0E,0x10,0x0E), 
591 
demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03, 
592 
0x10,0x05,0x10,0x07, 
593 
0x10,0x09,0x10,0x0B, 
594 
0x10,0x0D,0x10,0x0F); 
595  
596 
/*

597 
this is so I can play live CCIR raw video

598 
*/

599 
static int altivec_uyvy_rgb32 (SwsContext *c, 
600 
unsigned char **in, int *instrides, 
601 
int srcSliceY, int srcSliceH, 
602 
unsigned char **oplanes, int *outstrides) 
603 
{ 
604 
int w = c>srcW;

605 
int h = srcSliceH;

606 
int i,j;

607 
vector unsigned char uyvy; 
608 
vector signed short Y,U,V; 
609 
vector signed short vx,ux,uvx; 
610 
vector signed short R0,G0,B0,R1,G1,B1; 
611 
vector unsigned char R,G,B; 
612 
vector unsigned char *out; 
613 
ubyte *img; 
614  
615 
img = in[0];

616 
out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]); 
617  
618 
for (i=0;i<h;i++) { 
619 
for (j=0;j<w/16;j++) { 
620 
uyvy = vec_ld (0, img);

621 
U = (vector signed short) 
622 
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); 
623  
624 
V = (vector signed short) 
625 
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); 
626  
627 
Y = (vector signed short) 
628 
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); 
629  
630 
cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0); 
631  
632 
uyvy = vec_ld (16, img);

633 
U = (vector signed short) 
634 
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u); 
635  
636 
V = (vector signed short) 
637 
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v); 
638  
639 
Y = (vector signed short) 
640 
vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y); 
641  
642 
cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1); 
643  
644 
R = vec_packclp (R0,R1); 
645 
G = vec_packclp (G0,G1); 
646 
B = vec_packclp (B0,B1); 
647  
648 
// vec_mstbgr24 (R,G,B, out);

649 
out_rgba (R,G,B,out); 
650  
651 
img += 32;

652 
} 
653 
} 
654 
return srcSliceH;

655 
} 
656  
657  
658  
659 
/* Ok currently the acceleration routine only supports

660 
inputs of widths a multiple of 16

661 
and heights a multiple 2

662 

663 
So we just fall back to the C codes for this.

664 
*/

665 
SwsFunc yuv2rgb_init_altivec (SwsContext *c) 
666 
{ 
667 
if (!(c>flags & SWS_CPU_CAPS_ALTIVEC))

668 
return NULL; 
669  
670 
/*

671 
and this seems not to matter too much I tried a bunch of

672 
videos with abnormal widths and mplayer crashes else where.

673 
mplayer vo x11 rawvideo on:w=350:h=240 raw350x240.eyuv

674 
boom with X11 bad match.

675 

676 
*/

677 
if ((c>srcW & 0xf) != 0) return NULL; 
678  
679 
switch (c>srcFormat) {

680 
case IMGFMT_YVU9:

681 
case IMGFMT_IF09:

682 
case IMGFMT_YV12:

683 
case IMGFMT_I420:

684 
case IMGFMT_IYUV:

685 
case IMGFMT_CLPL:

686 
case IMGFMT_Y800:

687 
case IMGFMT_Y8:

688 
case IMGFMT_NV12:

689 
case IMGFMT_NV21:

690 
if ((c>srcH & 0x1) != 0) 
691 
return NULL; 
692  
693 
switch(c>dstFormat){

694 
case IMGFMT_RGB24:

695 
MSG_WARN("ALTIVEC: Color Space RGB24\n");

696 
return altivec_yuv2_rgb24;

697 
case IMGFMT_BGR24:

698 
MSG_WARN("ALTIVEC: Color Space BGR24\n");

699 
return altivec_yuv2_bgr24;

700 
case IMGFMT_RGB32:

701 
MSG_WARN("ALTIVEC: Color Space ARGB32\n");

702 
return altivec_yuv2_argb32;

703 
case IMGFMT_BGR32:

704 
MSG_WARN("ALTIVEC: Color Space BGRA32\n");

705 
// return profile_altivec_bgra32;

706  
707 
return altivec_yuv2_bgra32;

708 
default: return NULL; 
709 
} 
710 
break;

711  
712 
case IMGFMT_UYVY:

713 
switch(c>dstFormat){

714 
case IMGFMT_RGB32:

715 
MSG_WARN("ALTIVEC: Color Space UYVY > RGB32\n");

716 
return altivec_uyvy_rgb32;

717 
default: return NULL; 
718 
} 
719 
break;

720  
721 
} 
722 
return NULL; 
723 
} 
724  
/* Round a 16.16 fixed-point value to the nearest integer and clamp it to
   the signed 16-bit range.  The result is returned as the raw 16-bit
   pattern: values below -0x7FFF give 0x8000, values above 0x7FFF give
   0x7FFF, anything else is the two's-complement encoding of the rounded
   value.

   The lower bound is -0x7FFF (the minus sign had been lost, which made
   almost every input return 0x8000), and the intermediate is kept in 64
   bits so large inputs are clamped instead of being truncated first.  */
static uint16_t roundToInt16(int64_t f){
	int64_t r = (f + (1<<15))>>16;

	if (r < -0x7FFF)
		return 0x8000;
	if (r > 0x7FFF)
		return 0x7FFF;
	return (uint16_t)r;
}

732 
void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation) 
733 
{ 
734 
union {

735 
signed short tmp[8] __attribute__ ((aligned(16))); 
736 
vector signed short vec; 
737 
} buf; 
738  
739 
buf.tmp[0] = ( (0xffffLL) * contrast>>8 )>>9; //cy 
740 
buf.tmp[1] = 256*brightness; //oy 
741 
buf.tmp[2] = (inv_table[0]>>3) *(contrast>>16)*(saturation>>16); //crv 
742 
buf.tmp[3] = (inv_table[1]>>3) *(contrast>>16)*(saturation>>16); //cbu 
743 
buf.tmp[4] = ((inv_table[2]>>1)*(contrast>>16)*(saturation>>16)); //cgu 
744 
buf.tmp[5] = ((inv_table[3]>>1)*(contrast>>16)*(saturation>>16)); //cgv 
745  
746  
747 
c>CSHIFT = (vector unsigned short)vec_splat((vector unsigned short)AVV(2),0); 
748 
c>CY = vec_splat ((vector signed short)buf.vec, 0); 
749 
c>OY = vec_splat ((vector signed short)buf.vec, 1); 
750 
c>CRV = vec_splat ((vector signed short)buf.vec, 2); 
751 
c>CBU = vec_splat ((vector signed short)buf.vec, 3); 
752 
c>CGU = vec_splat ((vector signed short)buf.vec, 4); 
753 
c>CGV = vec_splat ((vector signed short)buf.vec, 5); 
754 
#if 0

755 
{

756 
int i;

757 
char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};

758 
for (i=0; i<6;i++)

759 
printf("%s %d ", v[i],buf.tmp[i] );

760 
printf("\n");

761 
}

762 
#endif

763 
return;

764 
} 
765  
766  
767 
void

768 
altivec_yuv2packedX (SwsContext *c, 
769 
int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,

770 
int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,

771 
uint8_t *dest, int dstW, int dstY) 
772 
{ 
773 
int i,j;

774 
short tmp __attribute__((aligned (16))); 
775 
int16_t *p; 
776 
short *f;

777 
vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V; 
778 
vector signed short R0,G0,B0,R1,G1,B1; 
779  
780 
vector unsigned char R,G,B,pels[3]; 
781 
vector unsigned char *out,*nout; 
782  
783 
vector signed short RND = vec_splat((vector signed short)AVV(1<<3),0); 
784 
vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0); 
785 
unsigned long scratch[16] __attribute__ ((aligned (16))); 
786  
787 
vector signed short *vYCoeffsBank, *vCCoeffsBank; 
788  
789 
vector signed short *YCoeffs, *CCoeffs; 
790  
791 
vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW); 
792 
vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW); 
793  
794 
for (i=0;i<lumFilterSize*dstW;i++) { 
795 
tmp = c>vLumFilter[i]; 
796 
p = &vYCoeffsBank[i]; 
797 
for (j=0;j<8;j++) 
798 
p[j] = tmp; 
799 
} 
800  
801 
for (i=0;i<chrFilterSize*dstW;i++) { 
802 
tmp = c>vChrFilter[i]; 
803 
p = &vCCoeffsBank[i]; 
804 
for (j=0;j<8;j++) 
805 
p[j] = tmp; 
806 
} 
807  
808 
YCoeffs = vYCoeffsBank+dstY*lumFilterSize; 
809 
CCoeffs = vCCoeffsBank+dstY*chrFilterSize; 
810  
811 
out = (vector unsigned char *)dest; 
812  
813 
for(i=0; i<dstW; i+=16){ 
814 
Y0 = RND; 
815 
Y1 = RND; 
816 
/* extract 16 coeffs from lumSrc */

817 
for(j=0; j<lumFilterSize; j++) { 
818 
X0 = vec_ld (0, &lumSrc[j][i]);

819 
X1 = vec_ld (16, &lumSrc[j][i]);

820 
Y0 = vec_mradds (X0, YCoeffs[j], Y0); 
821 
Y1 = vec_mradds (X1, YCoeffs[j], Y1); 
822 
} 
823  
824 
U = RND; 
825 
V = RND; 
826 
/* extract 8 coeffs from U,V */

827 
for(j=0; j<chrFilterSize; j++) { 
828 
X = vec_ld (0, &chrSrc[j][i/2]); 
829 
U = vec_mradds (X, CCoeffs[j], U); 
830 
X = vec_ld (0, &chrSrc[j][i/2+2048]); 
831 
V = vec_mradds (X, CCoeffs[j], V); 
832 
} 
833  
834 
/* scale and clip signals */

835 
Y0 = vec_sra (Y0, SCL); 
836 
Y1 = vec_sra (Y1, SCL); 
837 
U = vec_sra (U, SCL); 
838 
V = vec_sra (V, SCL); 
839  
840 
Y0 = vec_clip (Y0); 
841 
Y1 = vec_clip (Y1); 
842 
U = vec_clip (U); 
843 
V = vec_clip (V); 
844  
845 
/* now we have

846 
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15

847 
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7

848 

849 
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15

850 
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7

851 
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7

852 
*/

853  
854 
U0 = vec_mergeh (U,U); 
855 
V0 = vec_mergeh (V,V); 
856  
857 
U1 = vec_mergel (U,U); 
858 
V1 = vec_mergel (V,V); 
859  
860 
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); 
861 
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); 
862  
863 
R = vec_packclp (R0,R1); 
864 
G = vec_packclp (G0,G1); 
865 
B = vec_packclp (B0,B1); 
866  
867 
out_rgba (R,G,B,out); 
868 
} 
869  
870 
if (i < dstW) {

871 
i = 16;

872  
873 
Y0 = RND; 
874 
Y1 = RND; 
875 
/* extract 16 coeffs from lumSrc */

876 
for(j=0; j<lumFilterSize; j++) { 
877 
X0 = vec_ld (0, &lumSrc[j][i]);

878 
X1 = vec_ld (16, &lumSrc[j][i]);

879 
Y0 = vec_mradds (X0, YCoeffs[j], Y0); 
880 
Y1 = vec_mradds (X1, YCoeffs[j], Y1); 
881 
} 
882  
883 
U = RND; 
884 
V = RND; 
885 
/* extract 8 coeffs from U,V */

886 
for(j=0; j<chrFilterSize; j++) { 
887 
X = vec_ld (0, &chrSrc[j][i/2]); 
888 
U = vec_mradds (X, CCoeffs[j], U); 
889 
X = vec_ld (0, &chrSrc[j][i/2+2048]); 
890 
V = vec_mradds (X, CCoeffs[j], V); 
891 
} 
892  
893 
/* scale and clip signals */

894 
Y0 = vec_sra (Y0, SCL); 
895 
Y1 = vec_sra (Y1, SCL); 
896 
U = vec_sra (U, SCL); 
897 
V = vec_sra (V, SCL); 
898  
899 
Y0 = vec_clip (Y0); 
900 
Y1 = vec_clip (Y1); 
901 
U = vec_clip (U); 
902 
V = vec_clip (V); 
903  
904 
/* now we have

905 
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15

906 
U= u0 u1 u2 u3 u4 u5 u6 u7 V= v0 v1 v2 v3 v4 v5 v6 v7

907 

908 
Y0= y0 y1 y2 y3 y4 y5 y6 y7 Y1= y8 y9 y10 y11 y12 y13 y14 y15

909 
U0= u0 u0 u1 u1 u2 u2 u3 u3 U1= u4 u4 u5 u5 u6 u6 u7 u7

910 
V0= v0 v0 v1 v1 v2 v2 v3 v3 V1= v4 v4 v5 v5 v6 v6 v7 v7

911 
*/

912  
913 
U0 = vec_mergeh (U,U); 
914 
V0 = vec_mergeh (V,V); 
915  
916 
U1 = vec_mergel (U,U); 
917 
V1 = vec_mergel (V,V); 
918  
919 
cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0); 
920 
cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1); 
921  
922 
R = vec_packclp (R0,R1); 
923 
G = vec_packclp (G0,G1); 
924 
B = vec_packclp (B0,B1); 
925  
926 
nout = (vector unsigned char *)scratch; 
927 
out_rgba (R,G,B,nout); 
928  
929 
memcpy (&((uint32_t*)dest)[i], scratch, (dstWi)/4);

930 
} 
931  
932 
if (vYCoeffsBank) free (vYCoeffsBank);

933 
if (vCCoeffsBank) free (vCCoeffsBank);

934  
935 
} 