Statistics
| Branch: | Revision:

ffmpeg / postproc / yuv2rgb_altivec.c @ 3845b56d

History | View | Annotate | Download (29.6 KB)

1
/*
2
  marc.hoffman@analog.com    March 8, 2004
3

4
  Altivec Acceleration for Color Space Conversion revision 0.2
5

6
  convert I420 YV12 to RGB in various formats,
7
    it rejects images that are not in 420 formats
8
    it rejects images that don't have widths of multiples of 16
9
    it rejects images that don't have heights of multiples of 2
10
  reject defers to C simulation codes.
11

12
  lots of optimizations to be done here
13

14
  1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15
     so we currently use max min to clip
16

17
  2. the inefficient use of chroma loading needs a bit of brushing up
18

19
  3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20

21

22
  MODIFIED to calculate coeffs from currently selected color space.
23
  MODIFIED core to be a macro which you spec the output format.
24
  ADDED UYVY conversion which is never called due to something in SWSCALE.
25
  CORRECTED algorithm selection to be strict on input formats.
26
  ADDED runtime detection of altivec.
27

28
  ADDED altivec_yuv2packedX vertical scl + RGB converter
29

30
  March 27,2004
31
  PERFORMANCE ANALYSIS
32

33
  The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
34
  The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35

36
  720*480*30  ~10MPS
37

38
  so we have roughly 10clocks per pixel this is too high something has to be wrong.
39

40
  OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41

42
  OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43
  guaranteed to have the input video frame it was just decompressed so
44
  it probably resides in L1 caches.  However we are creating the
45
  output video stream this needs to use the DSTST instruction to
46
  optimize for the cache.  We couple this with the fact that we are
47
  not going to be visiting the input buffer again so we mark it Least
48
  Recently Used.  This shaves 25% of the processor cycles off.
49

50
  Now MEMCPY is the largest mips consumer in the system, probably due
51
  to the inefficient X11 stuff.
52

53
  GL libraries seem to be very slow on this machine 1.33Ghz PB running
54
  Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
55
  a versioning issue, however I have libGL.1.2.dylib for both
56
  machines. ((We need to figure this out now))
57

58
  GL2 libraries work now with patch for RGB32
59

60
  NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61

62
  Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. 
63

64
*/
65
#include <stdio.h>
66
#include <stdlib.h>
67
#include <string.h>
68
#include <inttypes.h>
69
#include <assert.h>
70
#include "config.h"
71
#ifdef HAVE_MALLOC_H
72
#include <malloc.h>
73
#endif
74
#include "rgb2rgb.h"
75
#include "swscale.h"
76
#include "swscale_internal.h"
77
#include "mangle.h"
78
#include "libvo/img_format.h" //FIXME try to reduce dependency of such stuff
79

    
80
#undef PROFILE_THE_BEAST
81
#undef INC_SCALING
82

    
83
typedef unsigned char ubyte;
84
typedef signed char   sbyte;
85

    
86

    
87
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
88
   homogeneous vector registers x0,x1,x2 are interleaved with the
89
   following technique:
90

91
      o0 = vec_mergeh (x0,x1);
92
      o1 = vec_perm (o0, x2, perm_rgb_0);
93
      o2 = vec_perm (o0, x2, perm_rgb_1);
94
      o3 = vec_mergel (x0,x1);
95
      o4 = vec_perm (o3,o2,perm_rgb_2);
96
      o5 = vec_perm (o3,o2,perm_rgb_3);
97

98
  perm_rgb_0:   o0(RG).h v1(B) --> o1*
99
              0   1  2   3   4
100
             rgbr|gbrg|brgb|rgbr
101
             0010 0100 1001 0010
102
             0102 3145 2673 894A
103

104
  perm_rgb_1:   o0(RG).h v1(B) --> o2
105
              0   1  2   3   4
106
             gbrg|brgb|bbbb|bbbb
107
             0100 1001 1111 1111
108
             B5CD 6EF7 89AB CDEF
109

110
  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
111
              0   1  2   3   4
112
             gbrg|brgb|rgbr|gbrg
113
             1111 1111 0010 0100
114
             89AB CDEF 0182 3945
115

116
  perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
117
              0   1  2   3   4
118
             brgb|rgbr|gbrg|brgb
119
             1001 0010 0100 1001
120
             a67b 89cA BdCD eEFf
121

122
*/
123
static
124
const vector unsigned char
125
  perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
126
                                      0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
127
  perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
128
                                      0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
129
  perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
130
                                      0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
131
  perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
132
                                      0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
133

    
134
/*
 * Interleave three planar byte vectors into three vectors of packed
 * 3-byte pixels (y0,y1,y2 = 48 consecutive output bytes).
 *
 * NOTE: the input parameters are declared in the order (x2,x1,x0).  The
 * bytes come out in the order x0,x1,x2, i.e. the THIRD macro argument is
 * the first byte of every pixel and the FIRST argument is the last.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                         \
do {                                                          \
  typeof(x0) m3_pair_hi, m3_carry, m3_pair_lo;                \
  /* x0/x1 pairs of the first eight pixels */                 \
  m3_pair_hi = vec_mergeh (x0, x1);                           \
  y0         = vec_perm (m3_pair_hi, x2, perm_rgb_0);         \
  /* bytes that did not fit in y0, plus the upper half of x2 */ \
  m3_carry   = vec_perm (m3_pair_hi, x2, perm_rgb_1);         \
  /* x0/x1 pairs of the last eight pixels */                  \
  m3_pair_lo = vec_mergel (x0, x1);                           \
  y1         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_2);   \
  y2         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_3);   \
} while(0)
144

    
145
/*
 * Interleave three planar byte vectors and store them as 16 packed 24-bit
 * pixels (48 bytes) with the byte order x0,x1,x2 inside every pixel.
 * ptr must be a modifiable pointer to vector; it is advanced by three
 * vectors.
 *
 * vec_merge3 declares its inputs as (x2,x1,x0), so the arguments have to be
 * handed over in that order to get x0 first in memory.  Passing them as
 * (x0,x1,x2) stored the channels reversed, which made out_rgb24 write
 * exactly the same bytes as out_bgr24.
 *
 * The definition deliberately has no trailing ';' so that
 * "vec_mstrgb24(...);" is a single statement and stays safe in if/else.
 */
#define vec_mstrgb24(x0,x1,x2,ptr)        \
do {                                      \
  typeof(x0) _0,_1,_2;                    \
  vec_merge3 (x2,x1,x0,_0,_1,_2);         \
  vec_st (_0, 0, ptr++);                  \
  vec_st (_1, 0, ptr++);                  \
  vec_st (_2, 0, ptr++);                  \
}  while (0)
153

    
154
/*
 * Interleave three planar byte vectors and store them as 16 packed 24-bit
 * pixels (48 bytes).  The arguments are forwarded to vec_merge3 as
 * (x2,x1,x0), which matches that macro's own parameter order, so every
 * pixel is written as the bytes x0,x1,x2.
 * ptr must be a modifiable pointer to vector; it is advanced by three
 * vectors.
 *
 * The definition deliberately has no trailing ';' so that
 * "vec_mstbgr24(...);" is a single statement and stays safe in if/else.
 */
#define vec_mstbgr24(x0,x1,x2,ptr)        \
do {                                      \
  typeof(x0) _0,_1,_2;                    \
  vec_merge3 (x2,x1,x0,_0,_1,_2);         \
  vec_st (_0, 0, ptr++);                  \
  vec_st (_1, 0, ptr++);                  \
  vec_st (_2, 0, ptr++);                  \
}  while (0)
162

    
163
/*
 * Pack four planar byte vectors into 16 pixels of 32 bits and store them
 * (64 bytes).  Every pixel is written as the bytes x0,x1,x2,x3, so x0 is
 * the most significant byte of the 32-bit word on a big-endian machine
 * (e.g. "rgb0": msb R, lsb 0).
 *
 *   T    vector type of x0..x3 (used for the temporaries and the stores)
 *   ptr  modifiable pointer to vector; advanced by four vectors
 *
 * The definition deliberately has no trailing ';' so that
 * "vec_mstrgb32(...);" is a single statement and stays safe in if/else.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                        \
do {                                                                           \
  T _0,_1,_2,_3;                                                               \
  /* pixels 0-7: byte pairs (x0,x1) and (x2,x3), merged as 16-bit units */     \
  _0 = vec_mergeh (x0,x1);                                                     \
  _1 = vec_mergeh (x2,x3);                                                     \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);    \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);    \
  vec_st (_2, 0*16, (T *)(ptr));                                               \
  vec_st (_3, 1*16, (T *)(ptr));                                               \
  /* pixels 8-15 */                                                            \
  _0 = vec_mergel (x0,x1);                                                     \
  _1 = vec_mergel (x2,x3);                                                     \
  _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1);    \
  _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1);    \
  vec_st (_2, 2*16, (T *)(ptr));                                               \
  vec_st (_3, 3*16, (T *)(ptr));                                               \
  ptr += 4;                                                                    \
}  while (0)
184

    
185
/*
186

187
  | 1     0       1.4021   | | Y |
188
  | 1    -0.3441 -0.7142   |x| Cb|
189
  | 1     1.7718  0           | | Cr|
190

191

192
  Y:      [-128 127]
193
  Cb/Cr : [-128 127]
194

195
  typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
196

197
*/
198

    
199

    
200

    
201

    
202
/*
 * Widen bytes 0-7 of x to eight 16-bit lanes.  Each lane is the byte pair
 * (0, x[i]): index 0x10 selects a byte of the all-zero second operand, so on
 * big-endian AltiVec this is a zero extension (unlike vec_unpackh, which
 * sign-extends).  The expansion is fully parenthesized so the cast cannot
 * bind to a neighbouring operator at the point of use.
 */
#define vec_unh(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                    0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
207
/*
 * Widen bytes 8-15 of x to eight 16-bit lanes.  Each lane is the byte pair
 * (0, x[i]): index 0x10 selects a byte of the all-zero second operand, so on
 * big-endian AltiVec this is a zero extension.  The expansion is fully
 * parenthesized so the cast cannot bind to a neighbouring operator at the
 * point of use.
 */
#define vec_unl(x) \
  ((vector signed short) \
    vec_perm((x),(typeof(x))AVV(0),\
             (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                    0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))
212

    
213
/* Clamp every lane of x to the range [16,235]: upper bound first (vec_min),
   then lower bound (vec_max). */
#define vec_clip(x) \
  vec_max ((typeof(x))AVV(16), \
           vec_min ((typeof(x))AVV(235), x))
215

    
216
/*
 * Clamp the lanes of x and y to [16,235] with vec_clip and pack both
 * vectors into one vector of bytes with vec_pack.  The expansion is
 * parenthesized so the cast applies to the whole result wherever the macro
 * is used.
 */
#define vec_packclp_a(x,y) \
  ((vector unsigned char)vec_pack (vec_clip (x), vec_clip (y)))
218

    
219
/*
 * Pack two vectors of signed 16-bit values into one vector of unsigned
 * bytes.  Negative lanes are first raised to 0 with vec_max; the result is
 * then reinterpreted as unsigned and narrowed with vec_packs, so no
 * separate vec_min is needed for the upper bound.  The expansion is
 * parenthesized so the cast applies to the whole result wherever the macro
 * is used.
 */
#define vec_packclp(x,y) \
  ((vector unsigned char)vec_packs \
      ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
       (vector unsigned short)vec_max (y,(vector signed short) AVV(0))))
223

    
224
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
225

    
226

    
227
/*
 * Convert one vector of 16-bit Y, U and V samples to 16-bit R, G and B
 * using the coefficients held in the SwsContext.
 *
 *   c        supplies CY, OY, CSHIFT, CBU, CRV, CGU and CGV
 *   Y, U, V  input samples; 128 is subtracted from U and V here
 *   R, G, B  receive the results (not clipped or packed)
 */
static inline void cvtyuvtoRGB (SwsContext *c,
                           vector signed short Y, vector signed short U, vector signed short V,
                           vector signed short *R, vector signed short *G, vector signed short *B)
{
  const vector signed short chroma_bias =
    vec_splat ((vector signed short)AVV(128), 0);
  vector signed short luma, cb, cr, green;

  /* scaled and offset luma, shared by all three channels */
  luma = vec_mradds (Y, c->CY, c->OY);

  /* centre the chroma samples around zero */
  cb = vec_sub (U, chroma_bias);
  cr = vec_sub (V, chroma_bias);

  /* B = luma + ((CBU*(cb<<CSHIFT)+0x4000)>>15) */
  *B = vec_mradds (vec_sl (cb, c->CSHIFT), c->CBU, luma);

  /* R = luma + ((CRV*(cr<<CSHIFT)+0x4000)>>15) */
  *R = vec_mradds (vec_sl (cr, c->CSHIFT), c->CRV, luma);

  /* G = luma + CGU*cb + CGV*cr, accumulated in two multiply-add steps */
  green = vec_mradds (cb, c->CGU, luma);
  *G = vec_mradds (cr, c->CGV, green);
}
251

    
252

    
253
/*
254
  ------------------------------------------------------------------------------
255
  CS converters
256
  ------------------------------------------------------------------------------
257
*/
258

    
259

    
260
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *   static int altivec_<name> (SwsContext *c,
 *                              unsigned char **in, int *instrides,
 *                              int srcSliceY, int srcSliceH,
 *                              unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice of planar 4:2:0 YUV (in[0]=Y, in[1]=U, in[2]=V) to
 * packed RGB in oplanes[0], starting at output row srcSliceY.  Two luma rows
 * share one chroma row, so rows are processed in pairs (h/2 iterations) and
 * 16 pixels are produced per inner iteration for each of the two rows.
 * out_pixels(R,G,B,ptr) stores 16 pixels and advances ptr.
 * The function returns srcSliceH.
 *
 * The width is expected to be a multiple of 16 (the inner loop runs w/16
 * times).  Luma is loaded with vec_ldl without realignment, so the luma
 * rows are presumably 16-byte aligned -- confirm with the caller; chroma is
 * realigned with vec_lvsl/vec_perm.
 *
 * Row stepping uses the real strides:
 *   - the odd luma row starts at in[0]+instrides[0] and both luma pointers
 *     move on by two full rows (previously in[0]+w and w+instrides[0], which
 *     is only correct when instrides[0] == w);
 *   - the output rows are recomputed from the row start and move on by
 *     2*outstrides[0] (previously "bytes written + one stride", which is
 *     only correct when the stride equals the packed row size).
 */
#define DEFCSP420_CVT(name,out_pixels)                                        \
static int altivec_##name (SwsContext *c,                                     \
                           unsigned char **in, int *instrides,                \
                           int srcSliceY, int srcSliceH,                      \
                           unsigned char **oplanes, int *outstrides)          \
{                                                                             \
  int w = c->srcW;                                                            \
  int h = srcSliceH;                                                          \
  int i,j;                                                                    \
  int instrides_scl[3];                                                       \
  vector unsigned char y0,y1;                                                 \
                                                                              \
  vector signed char  u,v;                                                    \
                                                                              \
  vector signed short Y0,Y1,Y2,Y3;                                            \
  vector signed short U,V;                                                    \
  vector signed short vx,ux,uvx;                                              \
  vector signed short vx0,ux0,uvx0;                                           \
  vector signed short vx1,ux1,uvx1;                                           \
  vector signed short R0,G0,B0;                                               \
  vector signed short R1,G1,B1;                                               \
  vector unsigned char R,G,B;                                                 \
                                                                              \
  vector unsigned char *uivP, *vivP;                                          \
  vector unsigned char align_perm;                                            \
                                                                              \
  vector signed short                                                         \
    lCY  = c->CY,                                                             \
    lOY  = c->OY,                                                             \
    lCRV = c->CRV,                                                            \
    lCBU = c->CBU,                                                            \
    lCGU = c->CGU,                                                            \
    lCGV = c->CGV;                                                            \
                                                                              \
  vector unsigned short lCSHIFT = c->CSHIFT;                                  \
                                                                              \
  ubyte *y1i   = in[0];                                                       \
  ubyte *y2i   = in[0]+instrides[0];                                          \
  ubyte *ui    = in[1];                                                       \
  ubyte *vi    = in[2];                                                       \
                                                                              \
  /* first byte of the even output row of the current row pair */             \
  ubyte *orow  = oplanes[0]+srcSliceY*outstrides[0];                          \
  vector unsigned char *oute, *outo;                                          \
                                                                              \
  /* what is left to skip after the inner loop has consumed a row */          \
  instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */     \
  instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */        \
  instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */        \
                                                                              \
  for (i=0;i<h/2;i++) {                                                       \
    oute = (vector unsigned char *)orow;                                      \
    outo = (vector unsigned char *)(orow+outstrides[0]);                      \
                                                                              \
    /* data-stream store hints for the two output rows */                     \
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                    \
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                    \
                                                                              \
    for (j=0;j<w/16;j++) {                                                    \
                                                                              \
      y0 = vec_ldl (0,y1i);                                                   \
      y1 = vec_ldl (0,y2i);                                                   \
      uivP = (vector unsigned char *)ui;                                      \
      vivP = (vector unsigned char *)vi;                                      \
                                                                              \
      /* chroma pointers move in steps of 8 bytes: realign the loads */       \
      align_perm = vec_lvsl (0, ui);                                          \
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);        \
                                                                              \
      align_perm = vec_lvsl (0, vi);                                          \
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);        \
                                                                              \
      u  = (vector signed char)                                               \
                     vec_sub (u,(vector signed char)                          \
                                vec_splat((vector signed char)AVV(128),0));   \
      v  = (vector signed char)                                               \
                     vec_sub (v,(vector signed char)                          \
                                vec_splat((vector signed char)AVV(128),0));   \
                                                                              \
      U  = vec_unpackh (u);                                                   \
      V  = vec_unpackh (v);                                                   \
                                                                              \
        Y0 = vec_unh (y0);                                                    \
        Y1 = vec_unl (y0);                                                    \
        Y2 = vec_unh (y1);                                                    \
        Y3 = vec_unl (y1);                                                    \
                                                                              \
        Y0 = vec_mradds (Y0, lCY, lOY);                                       \
        Y1 = vec_mradds (Y1, lCY, lOY);                                       \
        Y2 = vec_mradds (Y2, lCY, lOY);                                       \
        Y3 = vec_mradds (Y3, lCY, lOY);                                       \
                                                                              \
        /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                            \
        ux = vec_sl (U, lCSHIFT);                                             \
        ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));              \
        /* every chroma sample covers two horizontally adjacent pixels */     \
        ux0  = vec_mergeh (ux,ux);                                            \
        ux1  = vec_mergel (ux,ux);                                            \
                                                                              \
        /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */                      \
        vx = vec_sl (V, lCSHIFT);                                             \
        vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));              \
        vx0  = vec_mergeh (vx,vx);                                            \
        vx1  = vec_mergel (vx,vx);                                            \
                                                                              \
        /* uvx = ((CGU*u) + (CGV*v))>>15 */                                   \
        uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));              \
        uvx = vec_mradds (V, lCGV, uvx);                                      \
        uvx0 = vec_mergeh (uvx,uvx);                                          \
        uvx1 = vec_mergel (uvx,uvx);                                          \
                                                                              \
        /* even row */                                                        \
        R0 = vec_add (Y0,vx0);                                                \
        G0 = vec_add (Y0,uvx0);                                               \
        B0 = vec_add (Y0,ux0);                                                \
        R1 = vec_add (Y1,vx1);                                                \
        G1 = vec_add (Y1,uvx1);                                               \
        B1 = vec_add (Y1,ux1);                                                \
                                                                              \
        R  = vec_packclp (R0,R1);                                             \
        G  = vec_packclp (G0,G1);                                             \
        B  = vec_packclp (B0,B1);                                             \
                                                                              \
        out_pixels(R,G,B,oute);                                               \
                                                                              \
        /* odd row: same chroma terms, second luma row */                     \
        R0 = vec_add (Y2,vx0);                                                \
        G0 = vec_add (Y2,uvx0);                                               \
        B0 = vec_add (Y2,ux0);                                                \
        R1 = vec_add (Y3,vx1);                                                \
        G1 = vec_add (Y3,uvx1);                                               \
        B1 = vec_add (Y3,ux1);                                                \
        R  = vec_packclp (R0,R1);                                             \
        G  = vec_packclp (G0,G1);                                             \
        B  = vec_packclp (B0,B1);                                             \
                                                                              \
        out_pixels(R,G,B,outo);                                               \
                                                                              \
      y1i  += 16;                                                             \
      y2i  += 16;                                                             \
      ui   += 8;                                                              \
      vi   += 8;                                                              \
                                                                              \
    }                                                                         \
                                                                              \
    /* next pair of output rows */                                            \
    orow  += 2*outstrides[0];                                                 \
                                                                              \
    ui    += instrides_scl[1];                                                \
    vi    += instrides_scl[2];                                                \
    y1i   += instrides_scl[0];                                                \
    y2i   += instrides_scl[0];                                                \
  }                                                                           \
  return srcSliceH;                                                           \
}
413

    
414

    
415
/*
 * Output-format adapters for DEFCSP420_CVT: each takes the packed R,G,B byte
 * vectors as (a,b,c) plus the output pointer and forwards them to the
 * matching store macro.  The 32-bit variants fill the fourth channel with
 * an all-zero vector of the same type as a.
 */
#define out_zero_chan(a)     ((typeof (a))AVV(0))

/* 32-bit formats, named after the byte order written to memory */
#define out_abgr(a,b,c,ptr)  vec_mstrgb32(typeof(a), out_zero_chan(a), c, b, a, ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(typeof(a), c, b, a, out_zero_chan(a), ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(typeof(a), a, b, c, out_zero_chan(a), ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(typeof(a), out_zero_chan(a), a, b, c, ptr)

/* 24-bit formats */
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a, b, c, ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(c, b, a, ptr)
421

    
422
DEFCSP420_CVT (yuv2_abgr32, out_abgr)
423
#if 1
424
DEFCSP420_CVT (yuv2_bgra32, out_argb)
425
#else
426
static int altivec_yuv2_bgra32 (SwsContext *c,                                  
427
                                unsigned char **in, int *instrides,           
428
                                int srcSliceY,        int srcSliceH,                   
429
                                unsigned char **oplanes, int *outstrides)  
430
{                                                                           
431
  int w = c->srcW;                                                           
432
  int h = srcSliceH;                                                           
433
  int i,j;                                                                   
434
  int instrides_scl[3];                                                           
435
  vector unsigned char y0,y1;                                                   
436
                                                                           
437
  vector signed char  u,v;                                                   
438
                                                                           
439
  vector signed short Y0,Y1,Y2,Y3;                                           
440
  vector signed short U,V;                                                   
441
  vector signed short vx,ux,uvx;                                           
442
  vector signed short vx0,ux0,uvx0;                                           
443
  vector signed short vx1,ux1,uvx1;                                           
444
  vector signed short R0,G0,B0;                                                   
445
  vector signed short R1,G1,B1;                                                   
446
  vector unsigned char R,G,B;                                                   
447
                                                                           
448
  vector unsigned char *uivP, *vivP;                                              
449
  vector unsigned char align_perm;                                           
450
                                                                           
451
  vector signed short                                                            
452
    lCY  = c->CY,                                                           
453
    lOY  = c->OY,                                                           
454
    lCRV = c->CRV,                                                           
455
    lCBU = c->CBU,                                                           
456
    lCGU = c->CGU,                                                           
457
    lCGV = c->CGV;                                                           
458
                                                                           
459
  vector unsigned short lCSHIFT = c->CSHIFT;                                   
460
                                                                           
461
  ubyte *y1i   = in[0];                                                           
462
  ubyte *y2i   = in[0]+w;                                                   
463
  ubyte *ui    = in[1];                                                           
464
  ubyte *vi    = in[2];                                                           
465
                                                                           
466
  vector unsigned char *oute                                                   
467
    = (vector unsigned char *)                                                   
468
        (oplanes[0]+srcSliceY*outstrides[0]);                                   
469
  vector unsigned char *outo                                                   
470
    = (vector unsigned char *)                                                   
471
        (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);                   
472
                                                                           
473
                                                                           
474
  instrides_scl[0] = instrides[0];                                           
475
  instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */           
476
  instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */           
477
                                                                           
478
                                                                           
479
  for (i=0;i<h/2;i++) {                                                           
480
    vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);                 
481
    vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);                 
482
                                                                           
483
    for (j=0;j<w/16;j++) {                                                   
484
                                                                           
485
      y0 = vec_ldl (0,y1i);                                                   
486
      y1 = vec_ldl (0,y2i);                                                   
487
      uivP = (vector unsigned char *)ui;                                   
488
      vivP = (vector unsigned char *)vi;                                   
489
                                                                           
490
      align_perm = vec_lvsl (0, ui);                                           
491
      u = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);           
492
                                                                           
493
      align_perm = vec_lvsl (0, vi);                                           
494
      v = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
495
      u  = (vector signed char)
496
                     vec_sub (u,(vector signed char)
497
                                vec_splat((vector signed char)AVV(128),0));
498
      
499
      v  = (vector signed char)
500
                      vec_sub (v, (vector signed char)
501
                                vec_splat((vector signed char)AVV(128),0));
502
      
503
      U  = vec_unpackh (u);                                                   
504
      V  = vec_unpackh (v);                                                   
505
                                                                           
506
                                                                           
507
        Y0 = vec_unh (y0);                                                   
508
        Y1 = vec_unl (y0);                                                   
509
        Y2 = vec_unh (y1);                                                   
510
        Y3 = vec_unl (y1);                                                   
511
                                                                           
512
        Y0 = vec_mradds (Y0, lCY, lOY);                                           
513
        Y1 = vec_mradds (Y1, lCY, lOY);                                           
514
        Y2 = vec_mradds (Y2, lCY, lOY);                                           
515
        Y3 = vec_mradds (Y3, lCY, lOY);                                           
516
                                                                           
517
        /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                           
518
        ux = vec_sl (U, lCSHIFT);                                           
519
        ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
520
        ux0  = vec_mergeh (ux,ux);                                           
521
        ux1  = vec_mergel (ux,ux);                                           
522
                                                                           
523
        /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */                           
524
        vx = vec_sl (V, lCSHIFT);                                           
525
        vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
526
        vx0  = vec_mergeh (vx,vx);
527
        vx1  = vec_mergel (vx,vx);
528
        /* uvx = ((CGU*u) + (CGV*v))>>15 */
529
        uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
530
        uvx = vec_mradds (V, lCGV, uvx);
531
        uvx0 = vec_mergeh (uvx,uvx);
532
        uvx1 = vec_mergel (uvx,uvx);
533
        R0 = vec_add (Y0,vx0);
534
        G0 = vec_add (Y0,uvx0);
535
        B0 = vec_add (Y0,ux0);
536
        R1 = vec_add (Y1,vx1);
537
        G1 = vec_add (Y1,uvx1);
538
        B1 = vec_add (Y1,ux1);
539
        R  = vec_packclp (R0,R1);
540
        G  = vec_packclp (G0,G1);
541
        B  = vec_packclp (B0,B1);
542
        
543
        out_argb(R,G,B,oute);
544
        R0 = vec_add (Y2,vx0);
545
        G0 = vec_add (Y2,uvx0);
546
        B0 = vec_add (Y2,ux0);
547
        R1 = vec_add (Y3,vx1);
548
        G1 = vec_add (Y3,uvx1);
549
        B1 = vec_add (Y3,ux1);
550
        R  = vec_packclp (R0,R1);
551
        G  = vec_packclp (G0,G1);
552
        B  = vec_packclp (B0,B1);
553
        
554
        out_argb(R,G,B,outo);
555
        y1i  += 16;                                                           
556
        y2i  += 16;                                                           
557
        ui   += 8;
558
        vi   += 8;                                                           
559
                                                                           
560
    }                                                                           
561
                                                                           
562
    outo += (outstrides[0])>>4;                                                   
563
    oute += (outstrides[0])>>4;                                                   
564
                                                                           
565
    ui    += instrides_scl[1];                                                   
566
    vi    += instrides_scl[2];                                                   
567
    y1i   += instrides_scl[0];                                                   
568
    y2i   += instrides_scl[0];                                                   
569
  }                                                                           
570
  return srcSliceH;                                                           
571
}
572

    
573
#endif
574

    
575

    
576
DEFCSP420_CVT (yuv2_rgba32, out_rgba)
577
DEFCSP420_CVT (yuv2_argb32, out_argb)
578
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
579
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
580

    
581

    
582
// uyvy|uyvy|uyvy|uyvy
583
// 0123 4567 89ab cdef
584
static
585
const vector unsigned char
586
  demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
587
                                   0x10,0x04,0x10,0x04,
588
                                   0x10,0x08,0x10,0x08,
589
                                   0x10,0x0c,0x10,0x0c),
590
  demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
591
                                   0x10,0x06,0x10,0x06,
592
                                   0x10,0x0A,0x10,0x0A,
593
                                   0x10,0x0E,0x10,0x0E),
594
  demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
595
                                   0x10,0x05,0x10,0x07,
596
                                   0x10,0x09,0x10,0x0B,
597
                                   0x10,0x0D,0x10,0x0F);
598

    
599
/*
600
  this is so I can play live CCIR raw video
601
*/
602
/*
  Convert a slice of packed UYVY to 32 bit RGB (stored with out_rgba).

  c          - scaler context; srcW gives the row width in pixels and the
               colour coefficients are read by cvtyuvtoRGB
  in         - in[0] points at the first row of the slice
  instrides  - instrides[0] is the distance in bytes between input rows
  srcSliceY  - row of the destination image where this slice starts
  srcSliceH  - number of rows in the slice
  oplanes    - oplanes[0] is the base of the destination image
  outstrides - outstrides[0] is the distance in bytes between output rows

  Returns srcSliceH.

  Every row is located through its stride, so buffers whose rows are padded
  are handled correctly.  Only whole groups of 16 pixels are converted
  (yuv2rgb_init_altivec rejects widths that are not a multiple of 16).
*/
static int altivec_uyvy_rgb32 (SwsContext *c,
                               unsigned char **in, int *instrides,
                               int srcSliceY,        int srcSliceH,
                               unsigned char **oplanes, int *outstrides)
{
  int w = c->srcW;
  int h = srcSliceH;
  int i,j;
  vector unsigned char uyvy;
  vector signed   short Y,U,V;
  vector signed   short R0,G0,B0,R1,G1,B1;
  vector unsigned char  R,G,B;
  vector unsigned char *out;
  ubyte *img;

  for (i=0;i<h;i++) {
    /* restart from the row origin instead of assuming contiguous rows */
    img = in[0] + i*instrides[0];
    out = (vector unsigned char *)(oplanes[0]+(srcSliceY+i)*outstrides[0]);

    for (j=0;j<w/16;j++) {
      /* first 16 bytes: 8 pixels, widened to 16 bit lanes */
      uyvy = vec_ld (0, img);
      U = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);

      V = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);

      Y = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);

      cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);

      /* next 16 bytes: pixels 8..15 */
      uyvy = vec_ld (16, img);
      U = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);

      V = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);

      Y = (vector signed short)
        vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);

      cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);

      /* clip and narrow both halves to 16 unsigned bytes per channel */
      R  = vec_packclp (R0,R1);
      G  = vec_packclp (G0,G1);
      B  = vec_packclp (B0,B1);

      /* out_rgba is expected to advance `out` past the pixels it stored;
         nothing else in this loop moves the output pointer. */
      out_rgba (R,G,B,out);

      img += 32;
    }
  }
  return srcSliceH;
}
659

    
660

    
661

    
662
/* Ok currently the acceleration routine only supports
663
   inputs of widths a multiple of 16
664
   and heights a multiple of 2
665

666
   So we just fall back to the C codes for this.
667
*/
668
/*
  Pick the AltiVec converter matching the context's source and destination
  formats.  Returns NULL whenever the request cannot be served here
  (no AltiVec flag, width not a multiple of 16, odd height for the planar
  inputs, or an unsupported format pair).
*/
SwsFunc yuv2rgb_init_altivec (SwsContext *c)
{
  if ((c->flags & SWS_CPU_CAPS_ALTIVEC) == 0)
    return NULL;

  /*
    Widths that are not a multiple of 16 are refused.  This did not seem
    to matter much in practice: a bunch of videos with abnormal widths made
    mplayer crash elsewhere, e.g.
      mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
    fails with an X11 bad match.
  */
  if ((c->srcW & 0xf) != 0)
    return NULL;

  switch (c->srcFormat) {
  case IMGFMT_UYVY:
    /* packed input: only one output layout is implemented */
    if (c->dstFormat == IMGFMT_RGB32) {
      MSG_WARN("ALTIVEC: Color Space UYVY -> RGB32\n");
      return altivec_uyvy_rgb32;
    }
    return NULL;

  case IMGFMT_YVU9:
  case IMGFMT_IF09:
  case IMGFMT_YV12:
  case IMGFMT_I420:
  case IMGFMT_IYUV:
  case IMGFMT_CLPL:
  case IMGFMT_Y800:
  case IMGFMT_Y8:
  case IMGFMT_NV12:
  case IMGFMT_NV21:
    /* handled below */
    break;

  default:
    return NULL;
  }

  /* these inputs additionally need an even number of rows */
  if ((c->srcH & 0x1) != 0)
    return NULL;

  if (c->dstFormat == IMGFMT_RGB24) {
    MSG_WARN("ALTIVEC: Color Space RGB24\n");
    return altivec_yuv2_rgb24;
  }
  if (c->dstFormat == IMGFMT_BGR24) {
    MSG_WARN("ALTIVEC: Color Space BGR24\n");
    return altivec_yuv2_bgr24;
  }
  if (c->dstFormat == IMGFMT_RGB32) {
    MSG_WARN("ALTIVEC: Color Space ARGB32\n");
    return altivec_yuv2_argb32;
  }
  if (c->dstFormat == IMGFMT_BGR32) {
    MSG_WARN("ALTIVEC: Color Space BGRA32\n");
    return altivec_yuv2_bgra32;
  }

  return NULL;
}
727

    
728
/*
  Divide f by 2^16 with rounding to nearest and saturate the result to the
  16 bit signed range.  The value is returned as its 16 bit pattern in a
  uint16_t: 0x7FFF for the positive limit, 0x8000 for the negative limit.

  The intermediate is kept in 64 bits and out-of-range inputs are clamped
  before the rounding bias is added, so very large magnitudes saturate
  instead of wrapping when narrowed and the addition cannot overflow.
*/
static uint16_t roundToInt16(int64_t f){
        int64_t r;

        if(f >=  ((int64_t)0x7FFF<<16)) return 0x7FFF;
        if(f <= -((int64_t)0x8000<<16)) return 0x8000;

        r= (f + (1<<15))>>16;
             if(r<-0x7FFF) return 0x8000;
        else if(r> 0x7FFF) return 0x7FFF;
        else               return (uint16_t)r;
}
734

    
735
/*
  Compute the six conversion coefficients for the selected colour space and
  picture controls, and store each one, replicated across a whole vector,
  in the context (CY, OY, CRV, CBU, CGU, CGV).  CSHIFT is set to a vector
  filled with 2.

  inv_table  - four colour space coefficients; entries 0 and 1 feed CRV and
               CBU, entries 2 and 3 feed CGU and CGV (negated)
  brightness, contrast, saturation - picture controls; contrast and
               saturation are used shifted right by 16 for the chroma terms
*/
void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
{
  /* scalar view for filling in, vector view for splatting */
  union {
    signed short tmp[8] __attribute__ ((aligned(16)));
    vector signed short vec;
  } coef;
  const int con = contrast>>16;
  const int sat = saturation>>16;

  coef.tmp[0] =  ( (0xffffLL) * contrast>>8 )>>9;       /* cy  */
  coef.tmp[1] =  -256*brightness;                       /* oy  */
  coef.tmp[2] =  (inv_table[0]>>3) *con*sat;            /* crv */
  coef.tmp[3] =  (inv_table[1]>>3) *con*sat;            /* cbu */
  coef.tmp[4] = -((inv_table[2]>>1)*con*sat);           /* cgu */
  coef.tmp[5] = -((inv_table[3]>>1)*con*sat);           /* cgv */

  c->CSHIFT = (vector unsigned short)vec_splat((vector unsigned short)AVV(2),0);

  /* broadcast lane k of the coefficient vector into each context field */
  c->CY   = vec_splat ((vector signed short)coef.vec, 0);
  c->OY   = vec_splat ((vector signed short)coef.vec, 1);
  c->CRV  = vec_splat ((vector signed short)coef.vec, 2);
  c->CBU  = vec_splat ((vector signed short)coef.vec, 3);
  c->CGU  = vec_splat ((vector signed short)coef.vec, 4);
  c->CGV  = vec_splat ((vector signed short)coef.vec, 5);
#if 0
  {
    /* debugging aid: dump the scalar coefficients */
    int i;
    char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
    for (i=0; i<6;i++)
      printf("%s %d ", v[i],coef.tmp[i] );
    printf("\n");
  }
#endif
}
768

    
769

    
770
void
771
altivec_yuv2packedX (SwsContext *c,
772
                       int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
773
                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
774
                       uint8_t *dest, int dstW, int dstY)
775
{
776
  int i,j;
777
  short tmp __attribute__((aligned (16)));
778
  int16_t *p;
779
  short *f;
780
  vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
781
  vector signed short R0,G0,B0,R1,G1,B1;
782

    
783
  vector unsigned char R,G,B,pels[3];
784
  vector unsigned char *out,*nout;
785

    
786
  vector signed short   RND = vec_splat((vector signed short)AVV(1<<3),0);
787
  vector unsigned short SCL = vec_splat((vector unsigned short)AVV(4),0);
788
  unsigned long scratch[16] __attribute__ ((aligned (16)));
789

    
790
  vector signed short *vYCoeffsBank, *vCCoeffsBank;
791

    
792
  vector signed short *YCoeffs, *CCoeffs;
793

    
794
  vYCoeffsBank = memalign (16, sizeof (vector signed short)*lumFilterSize*c->dstH);
795
  vCCoeffsBank = memalign (16, sizeof (vector signed short)*chrFilterSize*c->dstH);
796

    
797
  for (i=0;i<lumFilterSize*c->dstH;i++) {
798
    tmp = c->vLumFilter[i];
799
    p = &vYCoeffsBank[i];
800
    for (j=0;j<8;j++)
801
      p[j] = tmp;
802
  }
803

    
804
  for (i=0;i<chrFilterSize*c->dstH;i++) {
805
    tmp = c->vChrFilter[i];
806
    p = &vCCoeffsBank[i];
807
    for (j=0;j<8;j++)
808
      p[j] = tmp;
809
  }
810

    
811
  YCoeffs = vYCoeffsBank+dstY*lumFilterSize;
812
  CCoeffs = vCCoeffsBank+dstY*chrFilterSize;
813

    
814
  out = (vector unsigned char *)dest;
815

    
816
  for(i=0; i<dstW; i+=16){
817
    Y0 = RND;
818
    Y1 = RND;
819
    /* extract 16 coeffs from lumSrc */
820
    for(j=0; j<lumFilterSize; j++) {
821
      X0 = vec_ld (0,  &lumSrc[j][i]);
822
      X1 = vec_ld (16, &lumSrc[j][i]);
823
      Y0 = vec_mradds (X0, YCoeffs[j], Y0);
824
      Y1 = vec_mradds (X1, YCoeffs[j], Y1);
825
    }
826

    
827
    U = RND;
828
    V = RND;
829
    /* extract 8 coeffs from U,V */
830
    for(j=0; j<chrFilterSize; j++) {
831
      X  = vec_ld (0, &chrSrc[j][i/2]);
832
      U  = vec_mradds (X, CCoeffs[j], U);
833
      X  = vec_ld (0, &chrSrc[j][i/2+2048]);
834
      V  = vec_mradds (X, CCoeffs[j], V);
835
    }
836

    
837
    /* scale and clip signals */
838
    Y0 = vec_sra (Y0, SCL);
839
    Y1 = vec_sra (Y1, SCL);
840
    U  = vec_sra (U,  SCL);
841
    V  = vec_sra (V,  SCL);
842

    
843
    Y0 = vec_clip (Y0);
844
    Y1 = vec_clip (Y1);
845
    U  = vec_clip (U);
846
    V  = vec_clip (V);
847

    
848
    /* now we have
849
      Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
850
      U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
851

852
      Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
853
      U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
854
      V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
855
    */
856

    
857
    U0 = vec_mergeh (U,U);
858
    V0 = vec_mergeh (V,V);
859

    
860
    U1 = vec_mergel (U,U);
861
    V1 = vec_mergel (V,V);
862

    
863
    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
864
    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
865

    
866
    R  = vec_packclp (R0,R1);
867
    G  = vec_packclp (G0,G1);
868
    B  = vec_packclp (B0,B1);
869

    
870
    switch(c->dstFormat) {
871
      case IMGFMT_ABGR: out_abgr (R,G,B,out); break;
872
      case IMGFMT_BGRA: out_bgra (R,G,B,out); break;
873
      case IMGFMT_RGBA: out_rgba (R,G,B,out); break;
874
      case IMGFMT_ARGB: out_argb (R,G,B,out); break;
875
      case IMGFMT_RGB24: out_rgb24 (R,G,B,out); break;
876
      case IMGFMT_BGR24: out_bgr24 (R,G,B,out); break;
877
      default:
878
        {
879
          /* FIXME: either write more out_* macros or punt to yuv2packedXinC */
880
          static int printed_error_message;
881
          if(!printed_error_message) {
882
            MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
883
                    vo_format_name(c->dstFormat));
884
            printed_error_message=1;
885
          }
886
          return;
887
        }
888
    }
889
  }
890

    
891
  if (i < dstW) {
892
    i -= 16;
893

    
894
    Y0 = RND;
895
    Y1 = RND;
896
    /* extract 16 coeffs from lumSrc */
897
    for(j=0; j<lumFilterSize; j++) {
898
      X0 = vec_ld (0,  &lumSrc[j][i]);
899
      X1 = vec_ld (16, &lumSrc[j][i]);
900
      Y0 = vec_mradds (X0, YCoeffs[j], Y0);
901
      Y1 = vec_mradds (X1, YCoeffs[j], Y1);
902
    }
903

    
904
    U = RND;
905
    V = RND;
906
    /* extract 8 coeffs from U,V */
907
    for(j=0; j<chrFilterSize; j++) {
908
      X  = vec_ld (0, &chrSrc[j][i/2]);
909
      U  = vec_mradds (X, CCoeffs[j], U);
910
      X  = vec_ld (0, &chrSrc[j][i/2+2048]);
911
      V  = vec_mradds (X, CCoeffs[j], V);
912
    }
913

    
914
    /* scale and clip signals */
915
    Y0 = vec_sra (Y0, SCL);
916
    Y1 = vec_sra (Y1, SCL);
917
    U  = vec_sra (U,  SCL);
918
    V  = vec_sra (V,  SCL);
919

    
920
    Y0 = vec_clip (Y0);
921
    Y1 = vec_clip (Y1);
922
    U  = vec_clip (U);
923
    V  = vec_clip (V);
924

    
925
    /* now we have
926
       Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
927
       U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
928

929
       Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
930
       U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
931
       V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
932
    */
933

    
934
    U0 = vec_mergeh (U,U);
935
    V0 = vec_mergeh (V,V);
936

    
937
    U1 = vec_mergel (U,U);
938
    V1 = vec_mergel (V,V);
939

    
940
    cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
941
    cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
942

    
943
    R  = vec_packclp (R0,R1);
944
    G  = vec_packclp (G0,G1);
945
    B  = vec_packclp (B0,B1);
946

    
947
    nout = (vector unsigned char *)scratch;
948
    switch(c->dstFormat) {
949
      case IMGFMT_ABGR: out_abgr (R,G,B,nout); break;
950
      case IMGFMT_BGRA: out_bgra (R,G,B,nout); break;
951
      case IMGFMT_RGBA: out_rgba (R,G,B,nout); break;
952
      case IMGFMT_ARGB: out_argb (R,G,B,nout); break;
953
      case IMGFMT_RGB24: out_rgb24 (R,G,B,nout); break;
954
      case IMGFMT_BGR24: out_bgr24 (R,G,B,nout); break;
955
      default:
956
        /* Unreachable, I think. */
957
        MSG_ERR("altivec_yuv2packedX doesn't support %s output\n",
958
                vo_format_name(c->dstFormat));
959
        return;
960
    }
961

    
962
    memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
963
  }
964

    
965
  if (vYCoeffsBank) free (vYCoeffsBank);
966
  if (vCCoeffsBank) free (vCCoeffsBank);
967

    
968
}