Statistics
| Branch: | Revision:

ffmpeg / postproc / yuv2rgb_altivec.c @ 582552fb

History | View | Annotate | Download (28.3 KB)

1
/*
2
  marc.hoffman@analog.com    March 8, 2004
3

4
  Altivec Acceleration for Color Space Conversion revision 0.2
5

6
  convert I420 YV12 to RGB in various formats,
7
    it rejects images that are not in 420 formats
8
    it rejects images that don't have widths of multiples of 16
9
    it rejects images that don't have heights of multiples of 2
10
  reject defers to C simulation codes.
11

12
  lots of optimizations to be done here
13

14
  1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15
     so we currently use max min to clip
16

17
  2. the inefficient use of chroma loading needs a bit of brushing up
18

19
  3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20

21

22
  MODIFIED to calculate coeffs from currently selected color space.
23
  MODIFIED core to be a macro which you spec the output format.
24
  ADDED UYVY conversion which is never called due to something in SWSCALE.
25
  CORRECTED algorithm selection to be strict on input formats.
26
  ADDED runtime detection of altivec.
27

28
  ADDED altivec_yuv2packedX vertical scl + RGB converter
29

30
  March 27,2004
31
  PERFORMANCE ANALYSIS
32

33
  The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34
  The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35

36
  720*480*30  ~10MPS
37

38
  so we have roughly 10 clocks per pixel; this is too high, something has to be wrong.
39

40
  OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41

42
  OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43
  guaranteed to have the input video frame it was just decompressed so
44
  it probably resides in L1 caches.  However we are creating the
45
  output video stream this needs to use the DSTST instruction to
46
  optimize for the cache.  We couple this with the fact that we are
47
  not going to be visiting the input buffer again so we mark it Least
48
  Recently Used.  This shaves 25% of the processor cycles off.
49

50
  Now MEMCPY is the largest mips consumer in the system, probably due
51
  to the inefficient X11 stuff.
52

53
  GL libraries seem to be very slow on this machine 1.33Ghz PB running
54
  Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
55
  a versioning issue; however, I have libGL.1.2.dylib for both
56
  machines. ((We need to figure this out now))
57

58
  GL2 libraries work now with patch for RGB32
59

60
  NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61

62
  Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. 
63

64
*/
65
#include <stdio.h>
66
#include <stdlib.h>
67
#include <string.h>
68
#include <inttypes.h>
69
#include <assert.h>
70
#include "config.h"
71
#include "rgb2rgb.h"
72
#include "swscale.h"
73
#include "swscale_internal.h"
74
#include "../mangle.h"
75
#include "../libvo/img_format.h" //FIXME try to reduce dependency of such stuff
76

    
77
#undef PROFILE_THE_BEAST
78
#undef INC_SCALING
79

    
80
/* Short aliases for 8-bit sample types with explicit signedness. */
typedef signed char   sbyte;   /* signed 8-bit value   */
typedef unsigned char ubyte;   /* unsigned 8-bit value */
82

    
83

    
84
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
85
   homogeneous vector registers x0,x1,x2 are interleaved with the
86
   following technique:
87

88
      o0 = vec_mergeh (x0,x1);
89
      o1 = vec_perm (o0, x2, perm_rgb_0);
90
      o2 = vec_perm (o0, x2, perm_rgb_1);
91
      o3 = vec_mergel (x0,x1);
92
      o4 = vec_perm (o3,o2,perm_rgb_2);
93
      o5 = vec_perm (o3,o2,perm_rgb_3);
94

95
  perm_rgb_0:   o0(RG).h v1(B) --> o1*
96
              0   1  2   3   4
97
             rgbr|gbrg|brgb|rgbr
98
             0010 0100 1001 0010
99
             0102 3145 2673 894A
100

101
  perm_rgb_1:   o0(RG).h v1(B) --> o2
102
              0   1  2   3   4
103
             gbrg|brgb|bbbb|bbbb
104
             0100 1001 1111 1111
105
             B5CD 6EF7 89AB CDEF
106

107
  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
108
              0   1  2   3   4
109
             gbrg|brgb|rgbr|gbrg
110
             1111 1111 0010 0100
111
             89AB CDEF 0182 3945
112

113
  perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
114
              0   1  2   3   4
115
             brgb|rgbr|gbrg|brgb
116
             1001 0010 0100 1001
117
             a67b 89cA BdCD eEFf
118

119
*/
120
static
121
const vector unsigned char
122
  perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
123
                                      0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
124
  perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
125
                                      0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
126
  perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
127
                                      0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
128
  perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
129
                                      0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
130

    
131
/*
 * vec_merge3(x2,x1,x0, y0,y1,y2)
 *
 * Interleaves three 16-byte channel vectors into three output vectors
 * (48 packed bytes).  x0 and x1 are byte-merged first; x2 is then woven
 * in with the perm_rgb_* selectors, so each output triple is ordered
 * x0,x1,x2.  Note the parameter list names the inputs in the order
 * x2,x1,x0.
 *
 * y0,y1,y2 must be assignable lvalues of the same vector type as x0.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                 \
do {                                                  \
  typeof(x0) m3_hi;    /* mergeh(x0,x1)            */ \
  typeof(x0) m3_tail;  /* leftovers of m3_hi + x2  */ \
  typeof(x0) m3_lo;    /* mergel(x0,x1)            */ \
                                                      \
  m3_hi   = vec_mergeh (x0, x1);                      \
  y0      = vec_perm (m3_hi, x2, perm_rgb_0);         \
  m3_tail = vec_perm (m3_hi, x2, perm_rgb_1);         \
  m3_lo   = vec_mergel (x0, x1);                      \
  y1      = vec_perm (m3_lo, m3_tail, perm_rgb_2);    \
  y2      = vec_perm (m3_lo, m3_tail, perm_rgb_3);    \
} while(0)
141

    
142
/*
 * vec_mstrgb24(x0,x1,x2,ptr)
 *
 * Interleaves three 16-byte channel vectors with vec_merge3 (arguments are
 * forwarded in the order x0,x1,x2) and stores the three resulting vectors
 * through ptr with vec_st, post-incrementing ptr after every store.
 *
 * ptr is evaluated and modified three times, so it must be a plain
 * modifiable pointer lvalue (the callers in this file pass a
 * `vector unsigned char *`).
 *
 * The expansion intentionally does not end in ';' -- the caller supplies
 * it.  A trailing semicolon here would turn `vec_mstrgb24(...);` into two
 * statements and break use inside an unbraced if/else.
 */
#define vec_mstrgb24(x0,x1,x2,ptr)        \
do {                                      \
  typeof(x0) _0,_1,_2;                    \
  vec_merge3 (x0,x1,x2,_0,_1,_2);         \
  vec_st (_0, 0, ptr++);                  \
  vec_st (_1, 0, ptr++);                  \
  vec_st (_2, 0, ptr++);                  \
} while (0)
150

    
151
/*
 * vec_mstbgr24(x0,x1,x2,ptr)
 *
 * Same as the 24-bit store above it, except that the channel vectors are
 * forwarded to vec_merge3 in reversed order (x2,x1,x0).  Stores three
 * vectors through ptr with vec_st, post-incrementing ptr after every store.
 *
 * ptr is evaluated and modified three times, so it must be a plain
 * modifiable pointer lvalue.
 *
 * The expansion intentionally does not end in ';' -- the caller supplies
 * it, so the macro behaves as exactly one statement.
 */
#define vec_mstbgr24(x0,x1,x2,ptr)        \
do {                                      \
  typeof(x0) _0,_1,_2;                    \
  vec_merge3 (x2,x1,x0,_0,_1,_2);         \
  vec_st (_0, 0, ptr++);                  \
  vec_st (_1, 0, ptr++);                  \
  vec_st (_2, 0, ptr++);                  \
} while (0)
159

    
160
/*
 * vec_mstrgb32(T,x0,x1,x2,x3,ptr)
 *
 * Packs four 16-byte channel vectors into 16 four-byte pixels and stores
 * them as four consecutive vectors (64 bytes) starting at ptr.  Within each
 * pixel the bytes appear in memory in the order x0,x1,x2,x3 (e.g. the
 * "rgb0" layout: first byte R, last byte 0, when called with R,G,B,zero).
 *
 *   T    vector type of x0..x3 and of the stored data
 *   ptr  modifiable pointer lvalue; advanced by 4 elements afterwards
 *
 * Bytes are merged pairwise (x0 with x1, x2 with x3) and the resulting
 * pairs are then merged as 16-bit units to form x0,x1,x2,x3 quads.
 *
 * The expansion intentionally does not end in ';' -- the caller supplies
 * it, so the macro behaves as exactly one statement.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                        \
do {                                                                           \
  T _0,_1,_2,_3;                                                               \
  /* pixels 0..7 */                                                            \
  _0 = vec_mergeh (x0,x1);                                                     \
  _1 = vec_mergeh (x2,x3);                                                     \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);    \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);    \
  vec_st (_2, 0*16, (T *)(ptr));                                               \
  vec_st (_3, 1*16, (T *)(ptr));                                               \
  /* pixels 8..15 */                                                           \
  _0 = vec_mergel (x0,x1);                                                     \
  _1 = vec_mergel (x2,x3);                                                     \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);    \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);    \
  vec_st (_2, 2*16, (T *)(ptr));                                               \
  vec_st (_3, 3*16, (T *)(ptr));                                               \
  ptr += 4;                                                                    \
} while (0)
181

    
182
/*
183

184
  | 1     0       1.4021   | | Y |
185
  | 1    -0.3441 -0.7142   |x| Cb|
186
  | 1     1.7718  0           | | Cr|
187

188

189
  Y:      [-128 127]
190
  Cb/Cr : [-128 127]
191

192
  typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
193

194
*/
195

    
196

    
197

    
198

    
199
/*
 * vec_unh(x): widen bytes 0..7 of x to eight 16-bit lanes.
 * Each lane is built from a zero byte (selector 0x10 = byte 0 of the
 * all-zero second operand) followed by one source byte, i.e. the source
 * bytes are zero-extended rather than sign-extended.
 */
#define vec_unh(x)                                                       \
  (vector signed short)                                                  \
    vec_perm (x, (typeof(x))AVV(0),                                      \
              (vector unsigned char)AVV(0x10,0x00, 0x10,0x01,            \
                                        0x10,0x02, 0x10,0x03,            \
                                        0x10,0x04, 0x10,0x05,            \
                                        0x10,0x06, 0x10,0x07))
204
/*
 * vec_unl(x): widen bytes 8..15 of x to eight 16-bit lanes.
 * Each lane is built from a zero byte (selector 0x10 = byte 0 of the
 * all-zero second operand) followed by one source byte, i.e. the source
 * bytes are zero-extended rather than sign-extended.
 */
#define vec_unl(x)                                                       \
  (vector signed short)                                                  \
    vec_perm (x, (typeof(x))AVV(0),                                      \
              (vector unsigned char)AVV(0x10,0x08, 0x10,0x09,            \
                                        0x10,0x0A, 0x10,0x0B,            \
                                        0x10,0x0C, 0x10,0x0D,            \
                                        0x10,0x0E, 0x10,0x0F))
209

    
210
/* vec_clip(x): clamp every lane of x to the range [16, 235]
   (upper bound applied first with vec_min, then lower bound with vec_max). */
#define vec_clip(x)                               \
  vec_max (vec_min (x, (typeof(x))AVV(235)),      \
           (typeof(x))AVV(16))
212

    
213
/* vec_packclp_a(x,y): clamp both inputs with vec_clip, then narrow them
   into a single byte vector with vec_pack (x supplies the first half). */
#define vec_packclp_a(x,y)                        \
  (vector unsigned char)                          \
    vec_pack (vec_clip (x), vec_clip (y))
215

    
216
/*
 * vec_packclp(x,y): convert two vectors of signed 16-bit lanes into one
 * vector of 16 unsigned bytes.  Negative lanes are first raised to 0 with
 * vec_max; the values are then reinterpreted as unsigned shorts and packed
 * with vec_packs, which saturates on the high side -- so no explicit
 * vec_min is needed.
 */
#define vec_packclp(x,y)                                                  \
  (vector unsigned char)vec_packs                                         \
      ((vector unsigned short)vec_max (x, (vector signed short) AVV(0)),  \
       (vector unsigned short)vec_max (y, (vector signed short) AVV(0)))
220

    
221
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
222

    
223

    
224
/*
 * Convert eight Y/U/V samples (one per 16-bit lane) to R, G and B using
 * the coefficients stored in the context.
 *
 *   c      context supplying CY, OY, CSHIFT, CBU, CRV, CGU and CGV
 *   Y,U,V  input samples; 128 is subtracted from U and V here
 *   R,G,B  receive the results as signed 16-bit lanes; no clipping to the
 *          8-bit range is performed in this function
 *
 * Every multiply uses vec_mradds (rounded, saturating multiply-high
 * plus add).
 */
static inline void cvtyuvtoRGB (SwsContext *c,
                           vector signed short Y, vector signed short U, vector signed short V,
                           vector signed short *R, vector signed short *G, vector signed short *B)
{
  /* splat element 0 so that every lane holds 128 regardless of how
     AVV() fills the remaining elements */
  const vector signed short chroma_bias =
      (vector signed short) vec_splat ((vector signed short)AVV(128), 0);
  vector signed short luma, cb, cr;
  vector signed short cb_scaled, cr_scaled, green_acc;

  /* scale and offset luma */
  luma = vec_mradds (Y, c->CY, c->OY);

  /* centre chroma around zero */
  cb = vec_sub (U, chroma_bias);
  cr = vec_sub (V, chroma_bias);

  /* B = luma + CBU * (cb << CSHIFT) */
  cb_scaled = vec_sl (cb, c->CSHIFT);
  *B = vec_mradds (cb_scaled, c->CBU, luma);

  /* R = luma + CRV * (cr << CSHIFT) */
  cr_scaled = vec_sl (cr, c->CSHIFT);
  *R = vec_mradds (cr_scaled, c->CRV, luma);

  /* G = luma + CGU * cb + CGV * cr (no pre-shift on this path) */
  green_acc = vec_mradds (cb, c->CGU, luma);
  *G = vec_mradds (cr, c->CGV, green_acc);
}
248

    
249

    
250
/*
251
  ------------------------------------------------------------------------------
252
  CS converters
253
  ------------------------------------------------------------------------------
254
*/
255

    
256

    
257
/*
 * DEFCSP420_CVT(name, out_pixels)
 *
 * Expands to
 *
 *   static int altivec_<name>(SwsContext *c,
 *                             unsigned char **in, int *instrides,
 *                             int srcSliceY, int srcSliceH,
 *                             unsigned char **oplanes, int *outstrides);
 *
 * a planar 4:2:0 (in[0]=Y, in[1]=U, in[2]=V) to packed RGB converter.
 * out_pixels(R,G,B,ptr) is the store macro that defines the output pixel
 * layout; it must write 16 pixels and advance ptr.
 *
 * The generated function handles two luma rows per outer iteration (they
 * share one chroma row) and 16 pixels per inner iteration, so it covers
 * (srcSliceH/2)*2 rows and (srcW/16)*16 columns.  It returns srcSliceH.
 *
 * Luma is fetched with vec_ldl, which ignores the low four address bits,
 * so luma rows are expected to be 16-byte aligned.  Chroma is fetched
 * through vec_lvsl/vec_perm and may be unaligned; only the first 8 chroma
 * bytes of each fetch are used (vec_unpackh).
 *
 * The second luma row is located with instrides[0] and the luma pointers
 * advance by two full strides per row pair.  (Earlier code used the image
 * width for this, which is only correct when instrides[0] == srcW; for
 * that case the behaviour is unchanged.)
 *
 * NOTE(review): the output pointers still advance by "row bytes written +
 * outstrides[0]" per row pair, i.e. they assume the output stride equals
 * the packed row size -- confirm against the callers.
 */
#define DEFCSP420_CVT(name,out_pixels)                                        \
static int altivec_##name (SwsContext *c,                                     \
                           unsigned char **in, int *instrides,                \
                           int srcSliceY, int srcSliceH,                      \
                           unsigned char **oplanes, int *outstrides)          \
{                                                                             \
  int w = c->srcW;                                                            \
  int h = srcSliceH;                                                          \
  int i,j;                                                                    \
  int instrides_scl[3];                                                       \
  vector unsigned char y0,y1;                                                 \
                                                                              \
  vector signed char  u,v;                                                    \
                                                                              \
  vector signed short Y0,Y1,Y2,Y3;                                            \
  vector signed short U,V;                                                    \
  vector signed short vx,ux,uvx;                                              \
  vector signed short vx0,ux0,uvx0;                                           \
  vector signed short vx1,ux1,uvx1;                                           \
  vector signed short R0,G0,B0;                                               \
  vector signed short R1,G1,B1;                                               \
  vector unsigned char R,G,B;                                                 \
                                                                              \
  vector unsigned char *uivP, *vivP;                                          \
  vector unsigned char align_perm;                                            \
                                                                              \
  /* local copies of the conversion coefficients */                           \
  vector signed short                                                         \
    lCY  = c->CY,                                                             \
    lOY  = c->OY,                                                             \
    lCRV = c->CRV,                                                            \
    lCBU = c->CBU,                                                            \
    lCGU = c->CGU,                                                            \
    lCGV = c->CGV;                                                            \
                                                                              \
  vector unsigned short lCSHIFT = c->CSHIFT;                                  \
                                                                              \
  ubyte *y1i   = in[0];                /* even luma row */                    \
  ubyte *y2i   = in[0]+instrides[0];   /* odd luma row  */                    \
  ubyte *ui    = in[1];                                                       \
  ubyte *vi    = in[2];                                                       \
                                                                              \
  /* even / odd output rows */                                                \
  vector unsigned char *oute                                                  \
    = (vector unsigned char *)                                                \
        (oplanes[0]+srcSliceY*outstrides[0]);                                 \
  vector unsigned char *outo                                                  \
    = (vector unsigned char *)                                                \
        (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                   \
                                                                              \
                                                                              \
  /* end-of-row-pair pointer adjustments */                                   \
  instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */     \
  instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */        \
  instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */        \
                                                                              \
                                                                              \
  for (i=0;i<h/2;i++) {                                                       \
    /* data-stream-touch-for-store hints for the two output rows */           \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                    \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                    \
                                                                              \
    for (j=0;j<w/16;j++) {                                                    \
                                                                              \
      y0 = vec_ldl (0,y1i);                                                   \
      y1 = vec_ldl (0,y2i);                                                   \
      uivP = (vector unsigned char *)ui;                                      \
      vivP = (vector unsigned char *)vi;                                      \
                                                                              \
      /* unaligned chroma fetch: two aligned loads + permute */               \
      align_perm = vec_lvsl (0, ui);                                          \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);        \
                                                                              \
      align_perm = vec_lvsl (0, vi);                                          \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);        \
                                                                              \
      /* centre chroma around zero */                                         \
      u  = (vector signed char)                                               \
                     vec_sub (u,(vector signed char)                          \
                                vec_splat((vector signed char)AVV(128),0));   \
      v  = (vector signed char)                                               \
                     vec_sub (v,(vector signed char)                          \
                                vec_splat((vector signed char)AVV(128),0));   \
                                                                              \
      /* sign-extend the first 8 chroma samples to 16 bits */                 \
      U  = vec_unpackh (u);                                                   \
      V  = vec_unpackh (v);                                                   \
                                                                              \
                                                                              \
        /* zero-extend 2 x 16 luma samples to 16 bits */                      \
        Y0 = vec_unh (y0);                                                    \
        Y1 = vec_unl (y0);                                                    \
        Y2 = vec_unh (y1);                                                    \
        Y3 = vec_unl (y1);                                                    \
                                                                              \
        Y0 = vec_mradds (Y0, lCY, lOY);                                       \
        Y1 = vec_mradds (Y1, lCY, lOY);                                       \
        Y2 = vec_mradds (Y2, lCY, lOY);                                       \
        Y3 = vec_mradds (Y3, lCY, lOY);                                       \
                                                                              \
        /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                            \
        ux = vec_sl (U, lCSHIFT);                                             \
        ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));              \
        /* duplicate each chroma term for two horizontal pixels */            \
        ux0  = vec_mergeh (ux,ux);                                            \
        ux1  = vec_mergel (ux,ux);                                            \
                                                                              \
        /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */                      \
        vx = vec_sl (V, lCSHIFT);                                             \
        vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));              \
        vx0  = vec_mergeh (vx,vx);                                            \
        vx1  = vec_mergel (vx,vx);                                            \
                                                                              \
        /* uvx = ((CGU*u) + (CGV*v))>>15 */                                   \
        uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));              \
        uvx = vec_mradds (V, lCGV, uvx);                                      \
        uvx0 = vec_mergeh (uvx,uvx);                                          \
        uvx1 = vec_mergel (uvx,uvx);                                          \
                                                                              \
        /* even row */                                                        \
        R0 = vec_add (Y0,vx0);                                                \
        G0 = vec_add (Y0,uvx0);                                               \
        B0 = vec_add (Y0,ux0);                                                \
        R1 = vec_add (Y1,vx1);                                                \
        G1 = vec_add (Y1,uvx1);                                               \
        B1 = vec_add (Y1,ux1);                                                \
                                                                              \
        R  = vec_packclp (R0,R1);                                             \
        G  = vec_packclp (G0,G1);                                             \
        B  = vec_packclp (B0,B1);                                             \
                                                                              \
        out_pixels(R,G,B,oute);                                               \
                                                                              \
        /* odd row: same chroma terms, second luma row */                     \
        R0 = vec_add (Y2,vx0);                                                \
        G0 = vec_add (Y2,uvx0);                                               \
        B0 = vec_add (Y2,ux0);                                                \
        R1 = vec_add (Y3,vx1);                                                \
        G1 = vec_add (Y3,uvx1);                                               \
        B1 = vec_add (Y3,ux1);                                                \
        R  = vec_packclp (R0,R1);                                             \
        G  = vec_packclp (G0,G1);                                             \
        B  = vec_packclp (B0,B1);                                             \
                                                                              \
                                                                              \
        out_pixels(R,G,B,outo);                                               \
                                                                              \
      y1i  += 16;                                                             \
      y2i  += 16;                                                             \
      ui   += 8;                                                              \
      vi   += 8;                                                              \
                                                                              \
    }                                                                         \
                                                                              \
    outo += (outstrides[0])>>4;                                               \
    oute += (outstrides[0])>>4;                                               \
                                                                              \
    ui    += instrides_scl[1];                                                \
    vi    += instrides_scl[2];                                                \
    y1i   += instrides_scl[0];                                                \
    y2i   += instrides_scl[0];                                                \
  }                                                                           \
  return srcSliceH;                                                           \
}
410

    
411

    
412
/*
 * Output adaptors usable as the out_pixels argument of DEFCSP420_CVT.
 * Each takes the three packed channel vectors in R,G,B order plus the
 * destination pointer and forwards them, reordered, to a store macro.
 * The 32-bit variants insert an all-zero vector as the fourth byte.
 *
 * NOTE(review): out_bgr24 reverses its arguments and vec_mstbgr24 reverses
 * them again, so it reaches vec_merge3 with the same argument order as
 * out_rgb24 -- confirm this is intended.
 */
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),b,g,r,ptr)
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))AVV(0)),ptr)
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))AVV(0)),ptr)
#define out_argb(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),r,g,b,ptr)
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
#define out_bgr24(r,g,b,ptr) vec_mstbgr24(b,g,r,ptr)
418

    
419
DEFCSP420_CVT (yuv2_abgr32, out_abgr)
420
#if 1
421
DEFCSP420_CVT (yuv2_bgra32, out_argb)
422
#else
423
static int altivec_yuv2_bgra32 (SwsContext *c,                                  
424
                                unsigned char **in, int *instrides,           
425
                                int srcSliceY,        int srcSliceH,                   
426
                                unsigned char **oplanes, int *outstrides)  
427
{                                                                           
428
  int w = c->srcW;                                                           
429
  int h = srcSliceH;                                                           
430
  int i,j;                                                                   
431
  int instrides_scl[3];                                                           
432
  vector unsigned char y0,y1;                                                   
433
                                                                           
434
  vector signed char  u,v;                                                   
435
                                                                           
436
  vector signed short Y0,Y1,Y2,Y3;                                           
437
  vector signed short U,V;                                                   
438
  vector signed short vx,ux,uvx;                                           
439
  vector signed short vx0,ux0,uvx0;                                           
440
  vector signed short vx1,ux1,uvx1;                                           
441
  vector signed short R0,G0,B0;                                                   
442
  vector signed short R1,G1,B1;                                                   
443
  vector unsigned char R,G,B;                                                   
444
                                                                           
445
  vector unsigned char *uivP, *vivP;                                              
446
  vector unsigned char align_perm;                                           
447
                                                                           
448
  vector signed short                                                            
449
    lCY  = c->CY,                                                           
450
    lOY  = c->OY,                                                           
451
    lCRV = c->CRV,                                                           
452
    lCBU = c->CBU,                                                           
453
    lCGU = c->CGU,                                                           
454
    lCGV = c->CGV;                                                           
455
                                                                           
456
  vector unsigned short lCSHIFT = c->CSHIFT;                                   
457
                                                                           
458
  ubyte *y1i   = in[0];                                                           
459
  ubyte *y2i   = in[0]+w;                                                   
460
  ubyte *ui    = in[1];                                                           
461
  ubyte *vi    = in[2];                                                           
462
                                                                           
463
  vector unsigned char *oute                                                   
464
    = (vector unsigned char *)                                                   
465
        (oplanes[0]+srcSliceY*outstrides[0]);                                   
466
  vector unsigned char *outo                                                   
467
    = (vector unsigned char *)                                                   
468
        (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                   
469
                                                                           
470
                                                                           
471
  instrides_scl[0] = instrides[0];                                           
472
  instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */           
473
  instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */           
474
                                                                           
475
                                                                           
476
  for (i=0;i<h/2;i++) {                                                           
477
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 
478
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 
479
                                                                           
480
    for (j=0;j<w/16;j++) {                                                   
481
                                                                           
482
      y0 = vec_ldl (0,y1i);                                                   
483
      y1 = vec_ldl (0,y2i);                                                   
484
      uivP = (vector unsigned char *)ui;                                   
485
      vivP = (vector unsigned char *)vi;                                   
486
                                                                           
487
      align_perm = vec_lvsl (0, ui);                                           
488
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);           
489
                                                                           
490
      align_perm = vec_lvsl (0, vi);                                           
491
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
492
      u  = (vector signed char)
493
                     vec_sub (u,(vector signed char)
494
                                vec_splat((vector signed char)AVV(128),0));
495
      
496
      v  = (vector signed char)
497
                      vec_sub (v, (vector signed char)
498
                                vec_splat((vector signed char)AVV(128),0));
499
      
500
      U  = vec_unpackh (u);                                                   
501
      V  = vec_unpackh (v);                                                   
502
                                                                           
503
                                                                           
504
        Y0 = vec_unh (y0);                                                   
505
        Y1 = vec_unl (y0);                                                   
506
        Y2 = vec_unh (y1);                                                   
507
        Y3 = vec_unl (y1);                                                   
508
                                                                           
509
        Y0 = vec_mradds (Y0, lCY, lOY);                                           
510
        Y1 = vec_mradds (Y1, lCY, lOY);                                           
511
        Y2 = vec_mradds (Y2, lCY, lOY);                                           
512
        Y3 = vec_mradds (Y3, lCY, lOY);                                           
513
                                                                           
514
        /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                           
515
        ux = vec_sl (U, lCSHIFT);                                           
516
        ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
517
        ux0  = vec_mergeh (ux,ux);                                           
518
        ux1  = vec_mergel (ux,ux);                                           
519
                                                                           
520
        /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */                           
521
        vx = vec_sl (V, lCSHIFT);                                           
522
        vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
523
        vx0  = vec_mergeh (vx,vx);
524
        vx1  = vec_mergel (vx,vx);
525
        /* uvx = ((CGU*u) + (CGV*v))>>15 */
526
        uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
527
        uvx = vec_mradds (V, lCGV, uvx);
528
        uvx0 = vec_mergeh (uvx,uvx);
529
        uvx1 = vec_mergel (uvx,uvx);
530
        R0 = vec_add (Y0,vx0);
531
        G0 = vec_add (Y0,uvx0);
532
        B0 = vec_add (Y0,ux0);
533
        R1 = vec_add (Y1,vx1);
534
        G1 = vec_add (Y1,uvx1);
535
        B1 = vec_add (Y1,ux1);
536
        R  = vec_packclp (R0,R1);
537
        G  = vec_packclp (G0,G1);
538
        B  = vec_packclp (B0,B1);
539
        
540
        out_argb(R,G,B,oute);
541
        R0 = vec_add (Y2,vx0);
542
        G0 = vec_add (Y2,uvx0);
543
        B0 = vec_add (Y2,ux0);
544
        R1 = vec_add (Y3,vx1);
545
        G1 = vec_add (Y3,uvx1);
546
        B1 = vec_add (Y3,ux1);
547
        R  = vec_packclp (R0,R1);
548
        G  = vec_packclp (G0,G1);
549
        B  = vec_packclp (B0,B1);
550
        
551
        out_argb(R,G,B,outo);
552
        y1i  += 16;                                                           
553
        y2i  += 16;                                                           
554
        ui   += 8;
555
        vi   += 8;                                                           
556
                                                                           
557
    }                                                                           
558
                                                                           
559
    outo += (outstrides[0])>>4;                                                   
560
    oute += (outstrides[0])>>4;                                                   
561
                                                                           
562
    ui    += instrides_scl[1];                                                   
563
    vi    += instrides_scl[2];                                                   
564
    y1i   += instrides_scl[0];                                                   
565
    y2i   += instrides_scl[0];                                                   
566
  }                                                                           
567
  return srcSliceH;                                                           
568
}
569

    
570
#endif
571

    
572

    
573
DEFCSP420_CVT (yuv2_rgba32, out_rgba)
574
DEFCSP420_CVT (yuv2_argb32, out_argb)
575
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
576
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
577

    
578

    
579
// uyvy|uyvy|uyvy|uyvy
580
// 0123 4567 89ab cdef
581
static
582
const vector unsigned char
583
  demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
584
                                   0x10,0x04,0x10,0x04,
585
                                   0x10,0x08,0x10,0x08,
586
                                   0x10,0x0c,0x10,0x0c),
587
  demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
588
                                   0x10,0x06,0x10,0x06,
589
                                   0x10,0x0A,0x10,0x0A,
590
                                   0x10,0x0E,0x10,0x0E),
591
  demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
592
                                   0x10,0x05,0x10,0x07,
593
                                   0x10,0x09,0x10,0x0B,
594
                                   0x10,0x0D,0x10,0x0F);
595

    
596
/*
597
  this is so I can play live CCIR raw video
598
*/
599
static int altivec_uyvy_rgb32 (SwsContext *c,
600
                               unsigned char **in, int *instrides,
601
                               int srcSliceY,        int srcSliceH,
602
                               unsigned char **oplanes, int *outstrides)
603
{
604
  int w = c->srcW;
605
  int h = srcSliceH;
606
  int i,j;
607
  vector unsigned char uyvy;
608
  vector signed   short Y,U,V;
609
  vector signed   short vx,ux,uvx;
610
  vector signed   short R0,G0,B0,R1,G1,B1;
611
  vector unsigned char  R,G,B;
612
  vector unsigned char *out;
613
  ubyte *img;
614

    
615
  img = in[0];
616
  out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
617

    
618
  for (i=0;i<h;i++) {
619
    for (j=0;j<w/16;j++) {
620
      uyvy = vec_ld (0, img);
621
      U = (vector signed short)
622
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
623

    
624
      V = (vector signed short)
625
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
626

    
627
      Y = (vector signed short)
628
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
629

    
630
      cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
631

    
632
      uyvy = vec_ld (16, img);
633
      U = (vector signed short)
634
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
635

    
636
      V = (vector signed short)
637
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
638

    
639
      Y = (vector signed short)
640
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
641

    
642
      cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
643

    
644
      R  = vec_packclp (R0,R1);
645
      G  = vec_packclp (G0,G1);
646
      B  = vec_packclp (B0,B1);
647

    
648
      //      vec_mstbgr24 (R,G,B, out);
649
      out_rgba (R,G,B,out);
650

    
651
      img += 32;
652
    }
653
  }
654
  return srcSliceH;
655
}
656

    
657

    
658

    
659
/* Ok currently the acceleration routine only supports
660
   inputs of widths a multiple of 16
661
   and heights a multiple 2
662

663
   So we just fall back to the C codes for this.
664
*/
665
SwsFunc yuv2rgb_init_altivec (SwsContext *c)
666
{
667
  if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))    
668
    return NULL;
669

    
670
  /*
671
    and this seems not to matter too much I tried a bunch of 
672
    videos with abnormal widths and mplayer crashes else where.
673
    mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv 
674
    boom with X11 bad match.
675
    
676
  */
677
  if ((c->srcW & 0xf) != 0)    return NULL;
678

    
679
  switch (c->srcFormat) {
680
  case IMGFMT_YVU9:
681
  case IMGFMT_IF09:
682
  case IMGFMT_YV12:
683
  case IMGFMT_I420:
684
  case IMGFMT_IYUV:
685
  case IMGFMT_CLPL:
686
  case IMGFMT_Y800:
687
  case IMGFMT_Y8:
688
  case IMGFMT_NV12:
689
  case IMGFMT_NV21:
690
    if ((c->srcH & 0x1) != 0)
691
      return NULL;
692

    
693
    switch(c->dstFormat){
694
    case IMGFMT_RGB24:
695
      MSG_WARN("ALTIVEC: Color Space RGB24\n");
696
      return altivec_yuv2_rgb24;
697
    case IMGFMT_BGR24:
698
      MSG_WARN("ALTIVEC: Color Space BGR24\n");
699
      return altivec_yuv2_bgr24;
700
    case IMGFMT_RGB32:
701
      MSG_WARN("ALTIVEC: Color Space ARGB32\n");
702
      return altivec_yuv2_argb32;
703
    case IMGFMT_BGR32:
704
      MSG_WARN("ALTIVEC: Color Space BGRA32\n");
705
      //      return profile_altivec_bgra32;
706

    
707
      return altivec_yuv2_bgra32;
708
    default: return NULL;
709
    }
710
    break;
711

    
712
  case IMGFMT_UYVY:
713
    switch(c->dstFormat){
714
    case IMGFMT_RGB32:
715
      MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
716
      return altivec_uyvy_rgb32;
717
    default: return NULL;
718
    }
719
    break;
720

    
721
  }
722
  return NULL;
723
}
724

    
725
/*
  Convert a fixed-point value with 16 fractional bits to the nearest
  integer (halves round up) and saturate it to the signed 16-bit range.
  The result is returned as the 16-bit two's complement pattern, i.e.
  0x8000 stands for the most negative value.

  The rounded value is kept in 64 bits until after the clamp; narrowing it
  to int first would let large inputs wrap around instead of saturating.
  Rounding is done as (f>>16) plus bit 15 of f, which equals
  (f + (1<<15))>>16 but cannot overflow for inputs near INT64_MAX.
*/
static uint16_t roundToInt16(int64_t f){
        int64_t r= (f>>16) + ((f>>15)&1);
             if(r<-0x7FFF) return 0x8000;
        else if(r> 0x7FFF) return 0x7FFF;
        else               return (uint16_t)r;
}
731

    
732
/*
  Derive the AltiVec conversion constants from the inverse colour table and
  the brightness / contrast / saturation settings and store them, splatted
  across all lanes, in the context (CSHIFT, CY, OY, CRV, CBU, CGU, CGV).

  c          - context that receives the constants
  inv_table  - four coefficients, used for crv, cbu, cgu and cgv in that order
  brightness, contrast, saturation - adjustment values; contrast and
               saturation are shifted right by 16 before use
*/
void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
{
  /* scalar view and vector view of the same 16-byte aligned storage */
  union {
    signed short tmp[8] __attribute__ ((aligned(16)));
    vector signed short vec;
  } coef;

  /* lanes 6 and 7 are never read and stay unset */
  coef.tmp[0] =  ( (0xffffLL) * contrast>>8 )>>9;                          /* cy  */
  coef.tmp[1] =  -256*brightness;                                          /* oy  */
  coef.tmp[2] =  (inv_table[0]>>3) *(contrast>>16)*(saturation>>16);       /* crv */
  coef.tmp[3] =  (inv_table[1]>>3) *(contrast>>16)*(saturation>>16);       /* cbu */
  coef.tmp[4] = -((inv_table[2]>>1)*(contrast>>16)*(saturation>>16));      /* cgu */
  coef.tmp[5] = -((inv_table[3]>>1)*(contrast>>16)*(saturation>>16));      /* cgv */

  /* broadcast every scalar to a full vector */
  c->CSHIFT = (vector unsigned short)vec_splat((vector unsigned short)AVV(2),0);
  c->CY     = vec_splat ((vector signed short)coef.vec, 0);
  c->OY     = vec_splat ((vector signed short)coef.vec, 1);
  c->CRV    = vec_splat ((vector signed short)coef.vec, 2);
  c->CBU    = vec_splat ((vector signed short)coef.vec, 3);
  c->CGU    = vec_splat ((vector signed short)coef.vec, 4);
  c->CGV    = vec_splat ((vector signed short)coef.vec, 5);

#if 0
  {
    /* debugging aid: dump the computed coefficients */
    int i;
    char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
    for (i=0; i<6;i++)
      printf("%s %d ", v[i],coef.tmp[i] );
    printf("\n");
  }
#endif
}
765

    
766

    
767
void
768
altivec_yuv2packedX (SwsContext *c,
769
                       int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
770
                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
771
                       uint8_t *dest, int dstW, int dstY)
772
{
773
  int i,j;
774
  short tmp __attribute__((aligned (16)));
775
  int16_t *p;
776
  short *f;
777
  vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
778
  vector signed short R0,G0,B0,R1,G1,B1;
779

    
780
  vector unsigned char R,G,B,pels[3];
781
  vector unsigned char *out,*nout;
782

    
783
  vector signed short   RND = vec_splat((vector signed short)AVV(1<<3),0);
784
  vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0);
785
  unsigned long scratch[16] __attribute__ ((aligned (16)));
786

    
787
  vector signed short *vYCoeffsBank, *vCCoeffsBank;
788

    
789
  vector signed short *YCoeffs, *CCoeffs;
790

    
791
  vYCoeffsBank = malloc (sizeof (vector signed short)*lumFilterSize*dstW);
792
  vCCoeffsBank = malloc (sizeof (vector signed short)*chrFilterSize*dstW);
793

    
794
  for (i=0;i<lumFilterSize*dstW;i++) {
795
    tmp = c->vLumFilter[i];
796
    p = &vYCoeffsBank[i];
797
    for (j=0;j<8;j++)
798
      p[j] = tmp;
799
  }
800

    
801
  for (i=0;i<chrFilterSize*dstW;i++) {
802
    tmp = c->vChrFilter[i];
803
    p = &vCCoeffsBank[i];
804
    for (j=0;j<8;j++)
805
      p[j] = tmp;
806
  }
807

    
808
  YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
809
  CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
810

    
811
  out = (vector unsigned char *)dest;
812

    
813
  for(i=0; i<dstW; i+=16){
814
    Y0 = RND;
815
    Y1 = RND;
816
    /* extract 16 coeffs from lumSrc */
817
    for(j=0; j<lumFilterSize; j++) {
818
      X0 = vec_ld (0,  &lumSrc[j][i]);
819
      X1 = vec_ld (16, &lumSrc[j][i]);
820
      Y0 = vec_mradds (X0, YCoeffs[j], Y0);
821
      Y1 = vec_mradds (X1, YCoeffs[j], Y1);
822
    }
823

    
824
    U = RND;
825
    V = RND;
826
    /* extract 8 coeffs from U,V */
827
    for(j=0; j<chrFilterSize; j++) {
828
      X  = vec_ld (0, &chrSrc[j][i/2]);
829
      U  = vec_mradds (X, CCoeffs[j], U);
830
      X  = vec_ld (0, &chrSrc[j][i/2+2048]);
831
      V  = vec_mradds (X, CCoeffs[j], V);
832
    }
833

    
834
    /* scale and clip signals */
835
    Y0 = vec_sra (Y0, SCL);
836
    Y1 = vec_sra (Y1, SCL);
837
    U  = vec_sra (U,  SCL);
838
    V  = vec_sra (V,  SCL);
839

    
840
    Y0 = vec_clip (Y0);
841
    Y1 = vec_clip (Y1);
842
    U  = vec_clip (U);
843
    V  = vec_clip (V);
844

    
845
    /* now we have
846
      Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
847
      U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
848

849
      Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
850
      U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
851
      V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
852
    */
853

    
854
    U0 = vec_mergeh (U,U);
855
    V0 = vec_mergeh (V,V);
856

    
857
    U1 = vec_mergel (U,U);
858
    V1 = vec_mergel (V,V);
859

    
860
    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
861
    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
862

    
863
    R  = vec_packclp (R0,R1);
864
    G  = vec_packclp (G0,G1);
865
    B  = vec_packclp (B0,B1);
866

    
867
    out_rgba (R,G,B,out);
868
  }
869

    
870
  if (i < dstW) {
871
    i -= 16;
872

    
873
    Y0 = RND;
874
    Y1 = RND;
875
    /* extract 16 coeffs from lumSrc */
876
    for(j=0; j<lumFilterSize; j++) {
877
      X0 = vec_ld (0,  &lumSrc[j][i]);
878
      X1 = vec_ld (16, &lumSrc[j][i]);
879
      Y0 = vec_mradds (X0, YCoeffs[j], Y0);
880
      Y1 = vec_mradds (X1, YCoeffs[j], Y1);
881
    }
882

    
883
    U = RND;
884
    V = RND;
885
    /* extract 8 coeffs from U,V */
886
    for(j=0; j<chrFilterSize; j++) {
887
      X  = vec_ld (0, &chrSrc[j][i/2]);
888
      U  = vec_mradds (X, CCoeffs[j], U);
889
      X  = vec_ld (0, &chrSrc[j][i/2+2048]);
890
      V  = vec_mradds (X, CCoeffs[j], V);
891
    }
892

    
893
    /* scale and clip signals */
894
    Y0 = vec_sra (Y0, SCL);
895
    Y1 = vec_sra (Y1, SCL);
896
    U  = vec_sra (U,  SCL);
897
    V  = vec_sra (V,  SCL);
898

    
899
    Y0 = vec_clip (Y0);
900
    Y1 = vec_clip (Y1);
901
    U  = vec_clip (U);
902
    V  = vec_clip (V);
903

    
904
    /* now we have
905
       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
906
       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
907

908
       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
909
       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
910
       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
911
    */
912

    
913
    U0 = vec_mergeh (U,U);
914
    V0 = vec_mergeh (V,V);
915

    
916
    U1 = vec_mergel (U,U);
917
    V1 = vec_mergel (V,V);
918

    
919
    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
920
    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
921

    
922
    R  = vec_packclp (R0,R1);
923
    G  = vec_packclp (G0,G1);
924
    B  = vec_packclp (B0,B1);
925

    
926
    nout = (vector unsigned char *)scratch;
927
    out_rgba (R,G,B,nout);
928

    
929
    memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
930
  }
931

    
932
  if (vYCoeffsBank) free (vYCoeffsBank);
933
  if (vCCoeffsBank) free (vCCoeffsBank);
934

    
935
}