Statistics
| Branch: | Revision:

ffmpeg / libswscale / yuv2rgb_altivec.c @ a23c9c4a

History | View | Annotate | Download (38.6 KB)

1
/*
2
  marc.hoffman@analog.com    March 8, 2004
3

4
  AltiVec acceleration for colorspace conversion revision 0.2
5

6
  convert I420 YV12 to RGB in various formats,
7
    it rejects images that are not in 420 formats
8
    it rejects images that don't have widths of multiples of 16
9
    it rejects images that don't have heights of multiples of 2
10
  reject defers to C simulation codes.
11

12
  lots of optimizations to be done here
13

14
  1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
15
     so we currently use max min to clip
16

17
  2. the inefficient use of chroma loading needs a bit of brushing up
18

19
  3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
20

21

22
  MODIFIED to calculate coeffs from currently selected color space.
23
  MODIFIED core to be a macro which you spec the output format.
24
  ADDED UYVY conversion which is never called due to some thing in SWSCALE.
25
  CORRECTED algorithm selection to be strict on input formats.
26
  ADDED runtime detection of altivec.
27

28
  ADDED altivec_yuv2packedX vertical scl + RGB converter
29

30
  March 27,2004
31
  PERFORMANCE ANALYSIS
32

33
  The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
34
  The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
35

36
  720*480*30  ~10MPS
37

38
  so we have roughly 10clocks per pixel this is too high something has to be wrong.
39

40
  OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
41

42
  OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
43
  guaranteed to have the input video frame it was just decompressed so
44
  it probably resides in L1 caches.  However we are creating the
45
  output video stream this needs to use the DSTST instruction to
46
  optimize for the cache.  We couple this with the fact that we are
47
  not going to be visiting the input buffer again so we mark it Least
48
  Recently Used.  This shaves 25% of the processor cycles off.
49

50
  Now MEMCPY is the largest mips consumer in the system, probably due
51
  to the inefficient X11 stuff.
52

53
  GL libraries seem to be very slow on this machine 1.33Ghz PB running
54
  Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
55
  a versioning issue; however, I have libGL.1.2.dylib for both
56
  machines. ((We need to figure this out now))
57

58
  GL2 libraries work now with patch for RGB32
59

60
  NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
61

62
  Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
63
*/
64

    
65
/*
66
 * This file is part of FFmpeg.
67
 *
68
 * FFmpeg is free software; you can redistribute it and/or modify
69
 * it under the terms of the GNU General Public License as published by
70
 * the Free Software Foundation; either version 2 of the License, or
71
 * (at your option) any later version.
72
 *
73
 * FFmpeg is distributed in the hope that it will be useful,
74
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
75
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
76
 * GNU General Public License for more details.
77
 *
78
 * You should have received a copy of the GNU General Public License
79
 * along with FFmpeg; if not, write to the Free Software
80
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
81
 */
82

    
83
#include <stdio.h>
84
#include <stdlib.h>
85
#include <string.h>
86
#include <inttypes.h>
87
#include <assert.h>
88
#include "config.h"
89
#ifdef HAVE_MALLOC_H
90
#include <malloc.h>
91
#endif
92
#include "rgb2rgb.h"
93
#include "swscale.h"
94
#include "swscale_internal.h"
95

    
96
#undef PROFILE_THE_BEAST
97
#undef INC_SCALING
98

    
99
typedef unsigned char ubyte;
100
typedef signed char   sbyte;
101

    
102

    
103
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
104
   homogeneous vector registers x0,x1,x2 are interleaved with the
105
   following technique:
106

107
      o0 = vec_mergeh (x0,x1);
108
      o1 = vec_perm (o0, x2, perm_rgb_0);
109
      o2 = vec_perm (o0, x2, perm_rgb_1);
110
      o3 = vec_mergel (x0,x1);
111
      o4 = vec_perm (o3,o2,perm_rgb_2);
112
      o5 = vec_perm (o3,o2,perm_rgb_3);
113

114
  perm_rgb_0:   o0(RG).h v1(B) --> o1*
115
              0   1  2   3   4
116
             rgbr|gbrg|brgb|rgbr
117
             0010 0100 1001 0010
118
             0102 3145 2673 894A
119

120
  perm_rgb_1:   o0(RG).h v1(B) --> o2
121
              0   1  2   3   4
122
             gbrg|brgb|bbbb|bbbb
123
             0100 1001 1111 1111
124
             B5CD 6EF7 89AB CDEF
125

126
  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
127
              0   1  2   3   4
128
             gbrg|brgb|rgbr|gbrg
129
             1111 1111 0010 0100
130
             89AB CDEF 0182 3945
131

132
  perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
133
              0   1  2   3   4
134
             brgb|rgbr|gbrg|brgb
135
             1001 0010 0100 1001
136
             a67b 89cA BdCD eEFf
137

138
*/
139
static
140
const vector unsigned char
141
  perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
142
                                               0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
143
  perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
144
                                               0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
145
  perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
146
                                               0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
147
  perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
148
                                               0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
149

    
150
/* vec_merge3(x2,x1,x0,y0,y1,y2)
   Interleave three 16-byte planes into 48 packed bytes held in y0,y1,y2.
   For every pixel the bytes come out in the order x0,x1,x2, i.e. the THIRD
   macro argument is the first byte in memory.  y0,y1,y2 must be lvalues. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                             \
do {                                                              \
    typeof(x0) m3_pair_hi, m3_carry, m3_pair_lo;                  \
    /* x0/x1 byte pairs for pixels 0-7 */                         \
    m3_pair_hi = vec_mergeh (x0,x1);                              \
    y0         = vec_perm (m3_pair_hi, x2, perm_rgb_0);           \
    /* leftovers of the first pairs plus the upper half of x2 */  \
    m3_carry   = vec_perm (m3_pair_hi, x2, perm_rgb_1);           \
    /* x0/x1 byte pairs for pixels 8-15 */                        \
    m3_pair_lo = vec_mergel (x0,x1);                              \
    y1         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_2);     \
    y2         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_3);     \
} while(0)
160

    
161
/* vec_mstbgr24(x0,x1,x2,ptr)
   Interleave three 16-byte planes and store them as 48 packed bytes with
   vec_st, advancing ptr by three elements.  Per pixel the memory order is
   x2,x1,x0 (see vec_merge3), so called with (R,G,B) this writes B,G,R.
   ptr must be a modifiable pointer lvalue; it is evaluated three times.
   The expansion deliberately carries no trailing ';' so the macro behaves
   like a single statement (safe inside an unbraced if/else). */
#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)
169

    
170
/* vec_mstrgb24(x0,x1,x2,ptr)
   Interleave three 16-byte planes and store them as 48 packed bytes with
   vec_st, advancing ptr by three elements.  The planes are handed to
   vec_merge3 in reverse, so per pixel the memory order is x0,x1,x2:
   called with (R,G,B) this writes R,G,B.
   ptr must be a modifiable pointer lvalue; it is evaluated three times.
   The expansion deliberately carries no trailing ';' so the macro behaves
   like a single statement (safe inside an unbraced if/else). */
#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)
178

    
179
/* vec_mstrgb32(T,x0,x1,x2,x3,ptr)
   Interleave four 16-byte planes of vector type T into 16 four-byte pixels
   and store them with four vec_st calls at byte offsets 0,16,32,48 from ptr.
   Per pixel the memory order is x0,x1,x2,x3 (e.g. x0=R ... x3=0 gives "rgb0").
   Afterwards ptr is advanced by 4 elements, which is 64 bytes when ptr is a
   vector pointer as in the callers in this file.
   ptr must be a modifiable pointer lvalue.
   The expansion deliberately carries no trailing ';' so the macro behaves
   like a single statement (safe inside an unbraced if/else). */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    /* pixels 0-7: byte pairs x0x1 and x2x3, then merged as 16-bit units */   \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)(ptr));                                            \
    vec_st (_3, 1*16, (T *)(ptr));                                            \
    /* pixels 8-15: same again on the low halves of the planes */             \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)(ptr));                                            \
    vec_st (_3, 3*16, (T *)(ptr));                                            \
    ptr += 4;                                                                 \
}  while (0)
200

    
201
/*
202

203
  | 1     0       1.4021   | | Y |
204
  | 1    -0.3441 -0.7142   |x| Cb|
205
  | 1     1.7718  0        | | Cr|
206

207

208
  Y:      [-128 127]
209
  Cb/Cr : [-128 127]
210

211
  typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
212

213
*/
214

    
215

    
216

    
217

    
218
/* vec_unh(x): widen bytes 0-7 of x to eight 16-bit lanes.  Each lane is built
   from a zero byte (selector 0x10 = byte 0 of the all-zero second operand)
   followed by one byte of x, i.e. an unsigned extension with the zero byte
   first.  The expansion and the argument are fully parenthesized so the
   macro can be used inside larger expressions. */
#define vec_unh(x) \
    ((vector signed short) \
        vec_perm((x),(typeof(x))AVV(0),\
                 (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                           0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
223
/* vec_unl(x): widen bytes 8-15 of x to eight 16-bit lanes.  Each lane is built
   from a zero byte (selector 0x10 = byte 0 of the all-zero second operand)
   followed by one byte of x, i.e. an unsigned extension with the zero byte
   first.  The expansion and the argument are fully parenthesized so the
   macro can be used inside larger expressions. */
#define vec_unl(x) \
    ((vector signed short) \
        vec_perm((x),(typeof(x))AVV(0),\
                 (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                           0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))
228

    
229
/* vec_clip_s16(x): clamp every signed 16-bit lane of x to the range [16,235]
   (upper bound applied first, then the lower bound). */
#define vec_clip_s16(x) \
    vec_max ((vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16), \
             vec_min ((x), \
                      (vector signed short)AVV(235,235,235,235,235,235,235,235)))
232

    
233
/* vec_packclp(x,y): pack two vectors of signed 16-bit lanes into one vector of
   16 unsigned bytes, clamping every lane to [0,255] (x supplies bytes 0-7,
   y bytes 8-15).
   vec_packsu on signed shorts saturates negative lanes to 0 and lanes above
   255 to 255 in one operation, which yields the same bytes as clamping at
   zero with vec_max and then doing an unsigned saturating pack. */
#define vec_packclp(x,y) \
    ((vector unsigned char)vec_packsu ((x), (y)))
237

    
238
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
239

    
240

    
241
/*
 * Convert eight pixels held as signed 16-bit lanes from YUV to RGB using the
 * coefficient vectors stored in the context (CY, OY, CBU, CRV, CGU, CGV and
 * the chroma pre-shift CSHIFT).
 *
 *   c        conversion context supplying the coefficients
 *   Y, U, V  input lanes; 128 is subtracted from U and V before use
 *   R, G, B  receive the results; they are NOT clamped here, callers pack
 *            and saturate them afterwards
 *
 * vec_mradds(a,b,acc) computes a rounded (a*b)>>15 plus acc with saturation.
 */
static inline void cvtyuvtoRGB (SwsContext *c,
                                vector signed short Y, vector signed short U, vector signed short V,
                                vector signed short *R, vector signed short *G, vector signed short *B)
{
    const vector signed short chroma_bias =
        (vector signed short)vec_splat((vector signed short)AVV(128),0);
    vector signed short luma, cb, cr;

    /* scaled and offset luma, shared by all three channels */
    luma = vec_mradds (Y, c->CY, c->OY);

    /* centre the chroma samples around zero */
    cb = vec_sub (U, chroma_bias);
    cr = vec_sub (V, chroma_bias);

    /* B = luma + ((CBU * (cb << CSHIFT) + 0x4000) >> 15) */
    *B = vec_mradds (vec_sl (cb, c->CSHIFT), c->CBU, luma);

    /* R = luma + ((CRV * (cr << CSHIFT) + 0x4000) >> 15) */
    *R = vec_mradds (vec_sl (cr, c->CSHIFT), c->CRV, luma);

    /* G = luma + rounded (CGU*cb)>>15 + rounded (CGV*cr)>>15 */
    *G = vec_mradds (cr, c->CGV, vec_mradds (cb, c->CGU, luma));
}
265

    
266

    
267
/*
268
  ------------------------------------------------------------------------------
269
  CS converters
270
  ------------------------------------------------------------------------------
271
*/
272

    
273

    
274
/*
 * DEFCSP420_CVT(name, out_pixels) expands to
 *
 *   static int altivec_<name>(SwsContext *c,
 *                             unsigned char **in, int *instrides,
 *                             int srcSliceY, int srcSliceH,
 *                             unsigned char **oplanes, int *outstrides);
 *
 * The generated function converts a slice of planar YUV with half-resolution
 * chroma (in[0] = Y, in[1] = U, in[2] = V) to packed pixels in oplanes[0],
 * starting at output row srcSliceY.  out_pixels(R,G,B,ptr) is the store macro
 * that interleaves 16 pixels, writes them at ptr and advances ptr.
 *
 * Work is done on two luma rows at a time, which share one chroma row, in
 * groups of 16 pixels: the trailing (c->srcW % 16) pixels of a row and an
 * odd last row of the slice are not converted.  Always returns srcSliceH.
 *
 * Inputs may be unaligned: every load is vec_lvsl + vec_perm over two
 * consecutive vectors, so up to 31 bytes starting at the aligned address
 * below each input pointer are read.
 * NOTE(review): stores go through vec_st, which targets 16-byte aligned
 * addresses, so oplanes[0] and outstrides[0] are presumably multiples of
 * 16 -- confirm against the caller.
 *
 * The two output row pointers are derived from outstrides[0] for every row
 * pair.  Advancing them by "pixels written + one stride" is only correct
 * when the stride equals the packed row width and drifts otherwise.
 */
#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           unsigned char **in, int *instrides,          \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
    vector unsigned char align_perm;                                    \
                                                                        \
    /* local copies of the context coefficients */                      \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    ubyte *y1i   = in[0];                                               \
    ubyte *y2i   = in[0]+instrides[0];                                  \
    ubyte *ui    = in[1];                                               \
    ubyte *vi    = in[2];                                               \
                                                                        \
    /* first output row of this slice */                                \
    ubyte *out_base = oplanes[0]+srcSliceY*outstrides[0];               \
    vector unsigned char *oute, *outo;                                  \
                                                                        \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        /* even and odd output row of pair i, from the real stride */   \
        oute = (vector unsigned char *)                                 \
                   (out_base + (2*i)*outstrides[0]);                    \
        outo = (vector unsigned char *)                                 \
                   (out_base + (2*i+1)*outstrides[0]);                  \
                                                                        \
        /* cache hints for the two rows about to be written */          \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            y1ivP = (vector unsigned char *)y1i;                        \
            y2ivP = (vector unsigned char *)y2i;                        \
            uivP  = (vector unsigned char *)ui;                         \
            vivP  = (vector unsigned char *)vi;                         \
                                                                        \
            /* unaligned loads: 16 luma per row, 16 chroma bytes each */ \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (uivP[0], uivP[1], align_perm);                \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vivP[0], vivP[1], align_perm);                \
                                                                        \
            /* centre chroma around zero */                             \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char)AVV(128),0));   \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char)AVV(128),0));   \
                                                                        \
            /* only the first 8 chroma samples are used per group */    \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));    \
            /* duplicate each term: one chroma sample serves 2 pixels */ \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));    \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));    \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            /* even row */                                              \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            /* odd row, same chroma terms */                            \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        /* step the inputs to the next row pair / next chroma row */    \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}
437

    
438

    
439
/* Store macros usable as the out_pixels argument of DEFCSP420_CVT.
   Arguments are always (red, green, blue, ptr); the macro name gives the
   byte order written to memory.  The fourth byte of the 32-bit formats is
   written as 0. */
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),b,g,r,ptr)
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))AVV(0)),ptr)
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))AVV(0)),ptr)
#define out_argb(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),r,g,b,ptr)
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
#define out_bgr24(r,g,b,ptr) vec_mstbgr24(r,g,b,ptr)
445

    
446
DEFCSP420_CVT (yuv2_abgr, out_abgr)
447
#if 1
448
DEFCSP420_CVT (yuv2_bgra, out_bgra)
449
#else
450
static int altivec_yuv2_bgra32 (SwsContext *c,
451
                                unsigned char **in, int *instrides,
452
                                int srcSliceY,        int srcSliceH,
453
                                unsigned char **oplanes, int *outstrides)
454
{
455
    int w = c->srcW;
456
    int h = srcSliceH;
457
    int i,j;
458
    int instrides_scl[3];
459
    vector unsigned char y0,y1;
460

    
461
    vector signed char  u,v;
462

    
463
    vector signed short Y0,Y1,Y2,Y3;
464
    vector signed short U,V;
465
    vector signed short vx,ux,uvx;
466
    vector signed short vx0,ux0,uvx0;
467
    vector signed short vx1,ux1,uvx1;
468
    vector signed short R0,G0,B0;
469
    vector signed short R1,G1,B1;
470
    vector unsigned char R,G,B;
471

    
472
    vector unsigned char *uivP, *vivP;
473
    vector unsigned char align_perm;
474

    
475
    vector signed short
476
        lCY  = c->CY,
477
        lOY  = c->OY,
478
        lCRV = c->CRV,
479
        lCBU = c->CBU,
480
        lCGU = c->CGU,
481
        lCGV = c->CGV;
482

    
483
    vector unsigned short lCSHIFT = c->CSHIFT;
484

    
485
    ubyte *y1i   = in[0];
486
    ubyte *y2i   = in[0]+w;
487
    ubyte *ui    = in[1];
488
    ubyte *vi    = in[2];
489

    
490
    vector unsigned char *oute
491
        = (vector unsigned char *)
492
          (oplanes[0]+srcSliceY*outstrides[0]);
493
    vector unsigned char *outo
494
        = (vector unsigned char *)
495
          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
496

    
497

    
498
    instrides_scl[0] = instrides[0];
499
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
500
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
501

    
502

    
503
    for (i=0;i<h/2;i++) {
504
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
505
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
506

    
507
        for (j=0;j<w/16;j++) {
508

    
509
            y0 = vec_ldl (0,y1i);
510
            y1 = vec_ldl (0,y2i);
511
            uivP = (vector unsigned char *)ui;
512
            vivP = (vector unsigned char *)vi;
513

    
514
            align_perm = vec_lvsl (0, ui);
515
            u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
516

    
517
            align_perm = vec_lvsl (0, vi);
518
            v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
519
            u  = (vector signed char)
520
                 vec_sub (u,(vector signed char)
521
                          vec_splat((vector signed char)AVV(128),0));
522

    
523
            v  = (vector signed char)
524
                 vec_sub (v, (vector signed char)
525
                          vec_splat((vector signed char)AVV(128),0));
526

    
527
            U  = vec_unpackh (u);
528
            V  = vec_unpackh (v);
529

    
530

    
531
            Y0 = vec_unh (y0);
532
            Y1 = vec_unl (y0);
533
            Y2 = vec_unh (y1);
534
            Y3 = vec_unl (y1);
535

    
536
            Y0 = vec_mradds (Y0, lCY, lOY);
537
            Y1 = vec_mradds (Y1, lCY, lOY);
538
            Y2 = vec_mradds (Y2, lCY, lOY);
539
            Y3 = vec_mradds (Y3, lCY, lOY);
540

    
541
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
542
            ux = vec_sl (U, lCSHIFT);
543
            ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
544
            ux0  = vec_mergeh (ux,ux);
545
            ux1  = vec_mergel (ux,ux);
546

    
547
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
548
            vx = vec_sl (V, lCSHIFT);
549
            vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
550
            vx0  = vec_mergeh (vx,vx);
551
            vx1  = vec_mergel (vx,vx);
552
            /* uvx = ((CGU*u) + (CGV*v))>>15 */
553
            uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
554
            uvx = vec_mradds (V, lCGV, uvx);
555
            uvx0 = vec_mergeh (uvx,uvx);
556
            uvx1 = vec_mergel (uvx,uvx);
557
            R0 = vec_add (Y0,vx0);
558
            G0 = vec_add (Y0,uvx0);
559
            B0 = vec_add (Y0,ux0);
560
            R1 = vec_add (Y1,vx1);
561
            G1 = vec_add (Y1,uvx1);
562
            B1 = vec_add (Y1,ux1);
563
            R  = vec_packclp (R0,R1);
564
            G  = vec_packclp (G0,G1);
565
            B  = vec_packclp (B0,B1);
566

    
567
            out_argb(R,G,B,oute);
568
            R0 = vec_add (Y2,vx0);
569
            G0 = vec_add (Y2,uvx0);
570
            B0 = vec_add (Y2,ux0);
571
            R1 = vec_add (Y3,vx1);
572
            G1 = vec_add (Y3,uvx1);
573
            B1 = vec_add (Y3,ux1);
574
            R  = vec_packclp (R0,R1);
575
            G  = vec_packclp (G0,G1);
576
            B  = vec_packclp (B0,B1);
577

    
578
            out_argb(R,G,B,outo);
579
            y1i  += 16;
580
            y2i  += 16;
581
            ui   += 8;
582
            vi   += 8;
583

    
584
        }
585

    
586
        outo  += (outstrides[0])>>4;
587
        oute  += (outstrides[0])>>4;
588

    
589
        ui    += instrides_scl[1];
590
        vi    += instrides_scl[2];
591
        y1i   += instrides_scl[0];
592
        y2i   += instrides_scl[0];
593
    }
594
    return srcSliceH;
595
}
596

    
597
#endif
598

    
599

    
600
DEFCSP420_CVT (yuv2_rgba, out_rgba)
601
DEFCSP420_CVT (yuv2_argb, out_argb)
602
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
603
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
604

    
605

    
606
// uyvy|uyvy|uyvy|uyvy
607
// 0123 4567 89ab cdef
608
static
609
const vector unsigned char
610
    demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
611
                                              0x10,0x04,0x10,0x04,
612
                                              0x10,0x08,0x10,0x08,
613
                                              0x10,0x0c,0x10,0x0c),
614
    demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
615
                                              0x10,0x06,0x10,0x06,
616
                                              0x10,0x0A,0x10,0x0A,
617
                                              0x10,0x0E,0x10,0x0E),
618
    demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
619
                                              0x10,0x05,0x10,0x07,
620
                                              0x10,0x09,0x10,0x0B,
621
                                              0x10,0x0D,0x10,0x0F);
622

    
623
/*
624
  this is so I can play live CCIR raw video
625
*/
626
static int altivec_uyvy_rgb32 (SwsContext *c,
627
                               unsigned char **in, int *instrides,
628
                               int srcSliceY,        int srcSliceH,
629
                               unsigned char **oplanes, int *outstrides)
630
{
631
    int w = c->srcW;
632
    int h = srcSliceH;
633
    int i,j;
634
    vector unsigned char uyvy;
635
    vector signed   short Y,U,V;
636
    vector signed   short R0,G0,B0,R1,G1,B1;
637
    vector unsigned char  R,G,B;
638
    vector unsigned char *out;
639
    ubyte *img;
640

    
641
    img = in[0];
642
    out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
643

    
644
    for (i=0;i<h;i++) {
645
        for (j=0;j<w/16;j++) {
646
            uyvy = vec_ld (0, img);
647
            U = (vector signed short)
648
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
649

    
650
            V = (vector signed short)
651
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
652

    
653
            Y = (vector signed short)
654
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
655

    
656
            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
657

    
658
            uyvy = vec_ld (16, img);
659
            U = (vector signed short)
660
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
661

    
662
            V = (vector signed short)
663
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
664

    
665
            Y = (vector signed short)
666
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
667

    
668
            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
669

    
670
            R  = vec_packclp (R0,R1);
671
            G  = vec_packclp (G0,G1);
672
            B  = vec_packclp (B0,B1);
673

    
674
            //      vec_mstbgr24 (R,G,B, out);
675
            out_rgba (R,G,B,out);
676

    
677
            img += 32;
678
        }
679
    }
680
    return srcSliceH;
681
}
682

    
683

    
684

    
685
/* Ok currently the acceleration routine only supports
686
   inputs of widths a multiple of 16
687
   and heights a multiple 2
688

689
   So we just fall back to the C codes for this.
690
*/
691
SwsFunc yuv2rgb_init_altivec (SwsContext *c)
692
{
693
    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
694
        return NULL;
695

    
696
    /*
697
      and this seems not to matter too much I tried a bunch of
698
      videos with abnormal widths and mplayer crashes else where.
699
      mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
700
      boom with X11 bad match.
701

702
    */
703
    if ((c->srcW & 0xf) != 0)    return NULL;
704

    
705
    switch (c->srcFormat) {
706
    case PIX_FMT_YUV410P:
707
    case PIX_FMT_YUV420P:
708
    /*case IMGFMT_CLPL:        ??? */
709
    case PIX_FMT_GRAY8:
710
    case PIX_FMT_NV12:
711
    case PIX_FMT_NV21:
712
        if ((c->srcH & 0x1) != 0)
713
            return NULL;
714

    
715
        switch(c->dstFormat){
716
        case PIX_FMT_RGB24:
717
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
718
            return altivec_yuv2_rgb24;
719
        case PIX_FMT_BGR24:
720
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
721
            return altivec_yuv2_bgr24;
722
        case PIX_FMT_ARGB:
723
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
724
            return altivec_yuv2_argb;
725
        case PIX_FMT_ABGR:
726
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
727
            return altivec_yuv2_abgr;
728
        case PIX_FMT_RGBA:
729
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
730
            return altivec_yuv2_rgba;
731
        case PIX_FMT_BGRA:
732
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
733
            return altivec_yuv2_bgra;
734
        default: return NULL;
735
        }
736
        break;
737

    
738
    case PIX_FMT_UYVY422:
739
        switch(c->dstFormat){
740
        case PIX_FMT_BGR32:
741
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
742
            return altivec_uyvy_rgb32;
743
        default: return NULL;
744
        }
745
        break;
746

    
747
    }
748
    return NULL;
749
}
750

    
751
/*
  Divide f by 65536 with rounding (adds 1<<15, then shifts right by 16)
  and saturate the quotient to the 16-bit range.

  Quotients above 0x7FFF yield 0x7FFF; quotients below -0x7FFF yield
  0x8000; everything in between is returned as its 16-bit two's
  complement bit pattern.

  The clamp is carried out on the full 64-bit quotient.  Narrowing to
  int first would let a large input wrap around and slip through the
  range checks (1<<48 would come out as 0 instead of saturating).
*/
static uint16_t roundToInt16(int64_t f){
    int64_t r;

    /* the rounding add itself must not overflow; such inputs saturate */
    if (f > INT64_MAX - (1<<15))
        return 0x7FFF;

    r = (f + (1<<15))>>16;
    if (r < -0x7FFF) return 0x8000;
    if (r >  0x7FFF) return 0x7FFF;
    return (uint16_t)r;
}
757

    
758
/*
  Compute the six 16-bit conversion coefficients from the inverse
  colourspace table and the brightness/contrast/saturation settings, and
  store each one splatted across a whole vector in the context
  (CY, OY, CRV, CBU, CGU, CGV), together with the constant shift CSHIFT.

  c          : context receiving the vectors
  inv_table  : four inverse-matrix entries; [0] feeds crv, [1] cbu,
               [2] cgu, [3] cgv
  brightness : scaled by -256 to form the luma offset
  contrast,
  saturation : only their bits above bit 15 enter the chroma terms
               (NOTE(review): presumably 16.16 fixed point -- confirm
               with the caller)
*/
void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
{
    /* scalar staging area that can be reloaded as one vector */
    union {
        signed short tmp[8] __attribute__ ((aligned(16)));
        vector signed short vec;
    } buf;
    const int con = contrast   >> 16;
    const int sat = saturation >> 16;

    /* lanes 6 and 7 are never written and never splatted below */
    buf.tmp[0] =  ((0xffffLL) * contrast>>8)>>9;      /* cy  */
    buf.tmp[1] =  -256*brightness;                    /* oy  */
    buf.tmp[2] =   (inv_table[0]>>3) * con * sat;     /* crv */
    buf.tmp[3] =   (inv_table[1]>>3) * con * sat;     /* cbu */
    buf.tmp[4] = -((inv_table[2]>>1) * con * sat);    /* cgu */
    buf.tmp[5] = -((inv_table[3]>>1) * con * sat);    /* cgv */

    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
    c->CY     = vec_splat (buf.vec, 0);
    c->OY     = vec_splat (buf.vec, 1);
    c->CRV    = vec_splat (buf.vec, 2);
    c->CBU    = vec_splat (buf.vec, 3);
    c->CGU    = vec_splat (buf.vec, 4);
    c->CGV    = vec_splat (buf.vec, 5);
#if 0
    {
        /* debugging aid: dump the scalar coefficients */
        int i;
        char *v[6]={"cy","oy","crv","cbu","cgu","cgv"};
        for (i=0; i<6; i++)
            printf("%s %d ", v[i],buf.tmp[i] );
        printf("\n");
    }
#endif
}
791

    
792

    
793
void
794
altivec_yuv2packedX (SwsContext *c,
795
                     int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
796
                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
797
                     uint8_t *dest, int dstW, int dstY)
798
{
799
    int i,j;
800
    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
801
    vector signed short R0,G0,B0,R1,G1,B1;
802

    
803
    vector unsigned char R,G,B;
804
    vector unsigned char *out,*nout;
805

    
806
    vector signed short   RND = vec_splat_s16(1<<3);
807
    vector unsigned short SCL = vec_splat_u16(4);
808
    unsigned long scratch[16] __attribute__ ((aligned (16)));
809

    
810
    vector signed short *YCoeffs, *CCoeffs;
811

    
812
    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
813
    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
814

    
815
    out = (vector unsigned char *)dest;
816

    
817
    for (i=0; i<dstW; i+=16){
818
        Y0 = RND;
819
        Y1 = RND;
820
        /* extract 16 coeffs from lumSrc */
821
        for (j=0; j<lumFilterSize; j++) {
822
            X0 = vec_ld (0,  &lumSrc[j][i]);
823
            X1 = vec_ld (16, &lumSrc[j][i]);
824
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
825
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
826
        }
827

    
828
        U = RND;
829
        V = RND;
830
        /* extract 8 coeffs from U,V */
831
        for (j=0; j<chrFilterSize; j++) {
832
            X  = vec_ld (0, &chrSrc[j][i/2]);
833
            U  = vec_mradds (X, CCoeffs[j], U);
834
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
835
            V  = vec_mradds (X, CCoeffs[j], V);
836
        }
837

    
838
        /* scale and clip signals */
839
        Y0 = vec_sra (Y0, SCL);
840
        Y1 = vec_sra (Y1, SCL);
841
        U  = vec_sra (U,  SCL);
842
        V  = vec_sra (V,  SCL);
843

    
844
        Y0 = vec_clip_s16 (Y0);
845
        Y1 = vec_clip_s16 (Y1);
846
        U  = vec_clip_s16 (U);
847
        V  = vec_clip_s16 (V);
848

    
849
        /* now we have
850
          Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
851
          U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
852

853
          Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
854
          U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
855
          V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
856
        */
857

    
858
        U0 = vec_mergeh (U,U);
859
        V0 = vec_mergeh (V,V);
860

    
861
        U1 = vec_mergel (U,U);
862
        V1 = vec_mergel (V,V);
863

    
864
        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
865
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
866

    
867
        R  = vec_packclp (R0,R1);
868
        G  = vec_packclp (G0,G1);
869
        B  = vec_packclp (B0,B1);
870

    
871
        switch(c->dstFormat) {
872
            case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
873
            case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
874
            case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
875
            case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
876
            case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
877
            case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
878
            default:
879
            {
880
                /* If this is reached, the caller should have called yuv2packedXinC
881
                   instead. */
882
                static int printed_error_message;
883
                if (!printed_error_message) {
884
                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
885
                           sws_format_name(c->dstFormat));
886
                    printed_error_message=1;
887
                }
888
                return;
889
            }
890
        }
891
    }
892

    
893
    if (i < dstW) {
894
        i -= 16;
895

    
896
        Y0 = RND;
897
        Y1 = RND;
898
        /* extract 16 coeffs from lumSrc */
899
        for (j=0; j<lumFilterSize; j++) {
900
            X0 = vec_ld (0,  &lumSrc[j][i]);
901
            X1 = vec_ld (16, &lumSrc[j][i]);
902
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
903
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
904
        }
905

    
906
        U = RND;
907
        V = RND;
908
        /* extract 8 coeffs from U,V */
909
        for (j=0; j<chrFilterSize; j++) {
910
            X  = vec_ld (0, &chrSrc[j][i/2]);
911
            U  = vec_mradds (X, CCoeffs[j], U);
912
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
913
            V  = vec_mradds (X, CCoeffs[j], V);
914
        }
915

    
916
        /* scale and clip signals */
917
        Y0 = vec_sra (Y0, SCL);
918
        Y1 = vec_sra (Y1, SCL);
919
        U  = vec_sra (U,  SCL);
920
        V  = vec_sra (V,  SCL);
921

    
922
        Y0 = vec_clip_s16 (Y0);
923
        Y1 = vec_clip_s16 (Y1);
924
        U  = vec_clip_s16 (U);
925
        V  = vec_clip_s16 (V);
926

    
927
        /* now we have
928
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
929
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
930

931
           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
932
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
933
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
934
        */
935

    
936
        U0 = vec_mergeh (U,U);
937
        V0 = vec_mergeh (V,V);
938

    
939
        U1 = vec_mergel (U,U);
940
        V1 = vec_mergel (V,V);
941

    
942
        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
943
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
944

    
945
        R  = vec_packclp (R0,R1);
946
        G  = vec_packclp (G0,G1);
947
        B  = vec_packclp (B0,B1);
948

    
949
        nout = (vector unsigned char *)scratch;
950
        switch(c->dstFormat) {
951
            case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
952
            case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
953
            case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
954
            case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
955
            case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
956
            case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
957
            default:
958
                /* Unreachable, I think. */
959
                av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
960
                       sws_format_name(c->dstFormat));
961
                return;
962
        }
963

    
964
        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
965
    }
966

    
967
}