Statistics
| Branch: | Revision:

ffmpeg / libswscale / yuv2rgb_altivec.c @ 4bdc44c7

History | View | Annotate | Download (38.3 KB)

1
/*
2
 * AltiVec acceleration for colorspace conversion
3
 *
4
 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; either version 2 of the License, or
11
 * (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU General Public License
19
 * along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 */
22

    
23
/*
24
convert I420 YV12 to RGB in various formats,
25
  it rejects images that are not in 420 formats
26
  it rejects images that don't have widths of multiples of 16
27
  it rejects images that don't have heights of multiples of 2
28
reject defers to C simulation codes.
29

30
lots of optimizations to be done here
31

32
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
33
   so we currently use max min to clip
34

35
2. the inefficient use of chroma loading needs a bit of brushing up
36

37
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
38

39

40
MODIFIED to calculate coeffs from currently selected color space.
41
MODIFIED core to be a macro which you spec the output format.
42
ADDED UYVY conversion which is never called due to some thing in SWSCALE.
43
CORRECTED algorithm selection to be strict on input formats.
44
ADDED runtime detection of altivec.
45

46
ADDED altivec_yuv2packedX vertical scl + RGB converter
47

48
March 27,2004
49
PERFORMANCE ANALYSIS
50

51
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo used as test
52
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
53

54
720*480*30  ~10MPS
55

56
so we have roughly 10clocks per pixel this is too high something has to be wrong.
57

58
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
59

60
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
61
guaranteed to have the input video frame it was just decompressed so
62
it probably resides in L1 caches.  However we are creating the
63
output video stream this needs to use the DSTST instruction to
64
optimize for the cache.  We couple this with the fact that we are
65
not going to be visiting the input buffer again so we mark it Least
66
Recently Used.  This shaves 25% of the processor cycles off.
67

68
Now MEMCPY is the largest mips consumer in the system, probably due
69
to the inefficient X11 stuff.
70

71
GL libraries seem to be very slow on this machine 1.33Ghz PB running
72
Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
73
a versioning issue, however I have libGL.1.2.dylib for both
74
machines. ((We need to figure this out now))
75

76
GL2 libraries work now with patch for RGB32
77

78
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
79

80
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
81
*/
82

    
83
#include <stdio.h>
84
#include <stdlib.h>
85
#include <string.h>
86
#include <inttypes.h>
87
#include <assert.h>
88
#include "config.h"
89
#ifdef HAVE_MALLOC_H
90
#include <malloc.h>
91
#endif
92
#include "rgb2rgb.h"
93
#include "swscale.h"
94
#include "swscale_internal.h"
95

    
96
#undef PROFILE_THE_BEAST
97
#undef INC_SCALING
98

    
99
typedef unsigned char ubyte;
100
typedef signed char   sbyte;
101

    
102

    
103
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
104
   homogeneous vector registers x0,x1,x2 are interleaved with the
105
   following technique:
106

107
      o0 = vec_mergeh (x0,x1);
108
      o1 = vec_perm (o0, x2, perm_rgb_0);
109
      o2 = vec_perm (o0, x2, perm_rgb_1);
110
      o3 = vec_mergel (x0,x1);
111
      o4 = vec_perm (o3,o2,perm_rgb_2);
112
      o5 = vec_perm (o3,o2,perm_rgb_3);
113

114
  perm_rgb_0:   o0(RG).h v1(B) --> o1*
115
              0   1  2   3   4
116
             rgbr|gbrg|brgb|rgbr
117
             0010 0100 1001 0010
118
             0102 3145 2673 894A
119

120
  perm_rgb_1:   o0(RG).h v1(B) --> o2
121
              0   1  2   3   4
122
             gbrg|brgb|bbbb|bbbb
123
             0100 1001 1111 1111
124
             B5CD 6EF7 89AB CDEF
125

126
  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
127
              0   1  2   3   4
128
             gbrg|brgb|rgbr|gbrg
129
             1111 1111 0010 0100
130
             89AB CDEF 0182 3945
131

132
  perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
133
              0   1  2   3   4
134
             brgb|rgbr|gbrg|brgb
135
             1001 0010 0100 1001
136
             a67b 89cA BdCD eEFf
137

138
*/
139
static
140
const vector unsigned char
141
  perm_rgb_0 = (const vector unsigned char)AVV(0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
142
                                               0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a),
143
  perm_rgb_1 = (const vector unsigned char)AVV(0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
144
                                               0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f),
145
  perm_rgb_2 = (const vector unsigned char)AVV(0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
146
                                               0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05),
147
  perm_rgb_3 = (const vector unsigned char)AVV(0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
148
                                               0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f);
149

    
150
/* Interleave three planar byte vectors into three packed output vectors
   (48 bytes, 16 three-byte groups).
   Note the reversed parameter order: each output group is laid out as
   (third argument, second argument, first argument).
   y0, y1 and y2 receive the first, second and third 16 output bytes. */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                               \
do {                                                                \
    typeof(x0) m3_pair_hi, m3_carry, m3_pair_lo;                    \
    m3_pair_hi = vec_mergeh (x0,x1);                                \
    y0         = vec_perm (m3_pair_hi, x2, perm_rgb_0);             \
    m3_carry   = vec_perm (m3_pair_hi, x2, perm_rgb_1);             \
    m3_pair_lo = vec_mergel (x0,x1);                                \
    y1         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_2);       \
    y2         = vec_perm (m3_pair_lo, m3_carry, perm_rgb_3);       \
} while(0)
160

    
161
/* Interleave the three byte planes x0,x1,x2 with vec_merge3 (arguments passed
   in the same order, so each stored group is x2,x1,x0) and store the 48
   resulting bytes through ptr with three vec_st calls.  ptr is
   post-incremented three times, so it must be a modifiable vector pointer.
   The expansion deliberately ends without a semicolon so the macro behaves
   like a single statement; the caller supplies the ';'. */
#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)
169

    
170
/* Interleave the three byte planes x0,x1,x2 with vec_merge3 (arguments passed
   in reverse order, so each stored group is x0,x1,x2) and store the 48
   resulting bytes through ptr with three vec_st calls.  ptr is
   post-incremented three times, so it must be a modifiable vector pointer.
   The expansion deliberately ends without a semicolon so the macro behaves
   like a single statement; the caller supplies the ';'. */
#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    typeof(x0) _0,_1,_2;                \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, ptr++);              \
    vec_st (_1, 0, ptr++);              \
    vec_st (_2, 0, ptr++);              \
}  while (0)
178

    
179
/* Pack four byte planes into 4-byte pixels and store them.
   T is the vector type of the planes.  The bytes of each stored pixel are, in
   memory order, x0,x1,x2,x3 (e.g. R,G,B,0 when called for the "rgb0" layout).
   Four vectors (64 bytes) are written at byte offsets 0,16,32,48 from ptr and
   ptr is then advanced by 4; ptr must be a modifiable pointer.
   The expansion deliberately ends without a semicolon so the macro behaves
   like a single statement; the caller supplies the ';'. */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)ptr);                                              \
    vec_st (_3, 1*16, (T *)ptr);                                              \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)ptr);                                              \
    vec_st (_3, 3*16, (T *)ptr);                                              \
    ptr += 4;                                                                 \
}  while (0)
200

    
201
/*
202

203
  | 1     0       1.4021   | | Y |
204
  | 1    -0.3441 -0.7142   |x| Cb|
205
  | 1     1.7718  0        | | Cr|
206

207

208
  Y:      [-128 127]
209
  Cb/Cr : [-128 127]
210

211
  typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
212

213
*/
214

    
215

    
216

    
217

    
218
/* vec_unh(x): widen bytes 0..7 of x to eight 16-bit lanes.  Each lane is built
   from byte 0x10 of the permute (the first byte of an all-zero vector)
   followed by one byte of x, i.e. the byte is zero-extended.
   The whole expansion is parenthesised so it can be used safely inside a
   larger expression. */
#define vec_unh(x) \
    ((vector signed short) \
        vec_perm(x,(typeof(x))AVV(0),\
                 (vector unsigned char)AVV(0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                           0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07)))
/* vec_unl(x): same as vec_unh but for bytes 8..15 of x. */
#define vec_unl(x) \
    ((vector signed short) \
        vec_perm(x,(typeof(x))AVV(0),\
                 (vector unsigned char)AVV(0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                           0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F)))
228

    
229
/* Clamp every signed 16-bit lane of x to the range [16, 235]:
   first limit from above with vec_min, then from below with vec_max. */
#define vec_clip_s16(x)                                                        \
    vec_max (vec_min (x,                                                       \
                      (vector signed short)AVV(235,235,235,235,235,235,235,235)), \
             (vector signed short)AVV( 16, 16, 16, 16, 16, 16, 16, 16))
232

    
233
/* Pack two vectors of signed 16-bit lanes (x, then y) into one vector of 16
   unsigned bytes.  Negative lanes are first raised to 0 with vec_max; the
   lanes are then reinterpreted as unsigned and narrowed with the saturating
   vec_packs.
   The whole expansion is parenthesised so it can be used safely inside a
   larger expression. */
#define vec_packclp(x,y) \
    ((vector unsigned char)vec_packs \
        ((vector unsigned short)vec_max (x,(vector signed short) AVV(0)), \
         (vector unsigned short)vec_max (y,(vector signed short) AVV(0))))
237

    
238
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(typeof(a),((typeof (a))AVV(0)),a,a,a,ptr)
239

    
240

    
241
/* Convert eight pixels from Y/U/V to R/G/B, all as signed 16-bit lanes.
 *
 * c       supplies the coefficient vectors CY, OY, CBU, CRV, CGU, CGV and the
 *         shift vector CSHIFT.
 * Y,U,V   input samples, one per lane; 128 is subtracted from U and V here.
 * R,G,B   outputs; written in the order B, R, G.  Results are not clipped to
 *         8 bits by this function.
 */
static inline void cvtyuvtoRGB (SwsContext *c,
                                vector signed short Y, vector signed short U, vector signed short V,
                                vector signed short *R, vector signed short *G, vector signed short *B)
{
    /* 128 in every lane: AVV(128) is splatted from element 0 */
    const vector signed short chroma_bias =
        vec_splat((vector signed short)AVV(128),0);
    vector signed short luma, cb, cr;
    vector signed short cb_shifted, cr_shifted, green_partial;

    /* scaled and offset luma, shared by all three channels */
    luma = vec_mradds (Y, c->CY, c->OY);
    cb   = vec_sub (U, chroma_bias);
    cr   = vec_sub (V, chroma_bias);

    /* B = luma + ((CBU*(cb<<CSHIFT)+0x4000)>>15) */
    cb_shifted = vec_sl (cb, c->CSHIFT);
    *B = vec_mradds (cb_shifted, c->CBU, luma);

    /* R = luma + ((CRV*(cr<<CSHIFT)+0x4000)>>15) */
    cr_shifted = vec_sl (cr, c->CSHIFT);
    *R = vec_mradds (cr_shifted, c->CRV, luma);

    /* G = luma + CGU*cb + CGV*cr, accumulated in two multiply-round-add steps */
    green_partial = vec_mradds (cb, c->CGU, luma);
    *G = vec_mradds (cr, c->CGV, green_partial);
}
265

    
266

    
267
/*
268
  ------------------------------------------------------------------------------
269
  CS converters
270
  ------------------------------------------------------------------------------
271
*/
272

    
273

    
274
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *     static int altivec_<name>(SwsContext *c,
 *                               unsigned char **in, int *instrides,
 *                               int srcSliceY, int srcSliceH,
 *                               unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice of planar 4:2:0 YUV (in[0]=Y, in[1]=U, in[2]=V) to a
 * packed RGB layout.  out_pixels(R,G,B,ptr) is the store macro that fixes the
 * output byte order; it must advance ptr itself.
 *
 * Each outer iteration handles two luma rows that share one chroma row; each
 * inner iteration handles 16 pixels of both rows (16 Y bytes per row, 8 U and
 * 8 V bytes, every chroma sample being used for two horizontally adjacent
 * pixels).  Only w/16 full groups per row and h/2 row pairs are processed.
 * Output starts at row srcSliceY of oplanes[0].  Returns srcSliceH.
 *
 * Input pointers may be unaligned.  Loads use the vec_ld(0)/vec_ld(last byte)
 * + vec_lvsl + vec_perm idiom, where the second load is addressed by the last
 * byte that is really needed (+15 for luma, +7 for chroma).  This way an
 * already aligned pointer never makes the code touch the following 16-byte
 * block, which may lie beyond the end of the plane.
 * NOTE(review): output is written with vec_st through oute/outo, so the
 * destination rows are presumably expected to be 16-byte aligned - confirm
 * against the dispatcher.
 */
#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           unsigned char **in, int *instrides,          \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    ubyte *y1i   = in[0];                                               \
    ubyte *y2i   = in[0]+instrides[0];                                  \
    ubyte *ui    = in[1];                                               \
    ubyte *vi    = in[2];                                               \
                                                                        \
    /* oute: even output row, outo: the odd row right below it */       \
    vector unsigned char *oute                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]);                       \
    vector unsigned char *outo                                          \
        = (vector unsigned char *)                                      \
            (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);         \
                                                                        \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        /* data-stream store hints for the two output rows */           \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            /* luma: 16 bytes per row, last needed byte is at +15 */    \
            y0 = (vector unsigned char)                                 \
                 vec_perm (vec_ld (0, y1i), vec_ld (15, y1i),           \
                           vec_lvsl (0, y1i));                          \
                                                                        \
            y1 = (vector unsigned char)                                 \
                 vec_perm (vec_ld (0, y2i), vec_ld (15, y2i),           \
                           vec_lvsl (0, y2i));                          \
                                                                        \
            /* chroma: only the first 8 bytes are consumed below */     \
            /* (vec_unpackh), so the last needed byte is at +7   */     \
            u = (vector signed char)                                    \
                vec_perm (vec_ld (0, ui), vec_ld (7, ui),               \
                          vec_lvsl (0, ui));                            \
                                                                        \
            v = (vector signed char)                                    \
                vec_perm (vec_ld (0, vi), vec_ld (7, vi),               \
                          vec_lvsl (0, vi));                            \
                                                                        \
            /* remove the 128 bias (modulo-256 byte arithmetic) */      \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char)AVV(128),0));   \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char)AVV(128),0));   \
                                                                        \
            /* sign-extend the 8 chroma samples to 16 bits */           \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            /* zero-extend luma: Y0/Y1 = row 1, Y2/Y3 = row 2 */        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));    \
            /* duplicate each sample for two adjacent pixels */         \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));    \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));    \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            /* first (even) row */                                      \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            /* second (odd) row reuses the same chroma terms */         \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        /* advance each output pointer by one stride, in 16-byte units */ \
        outo  += (outstrides[0])>>4;                                    \
        oute  += (outstrides[0])>>4;                                    \
                                                                        \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}
437

    
438

    
439
/* Store macros usable as the out_pixels argument of DEFCSP420_CVT.
   r,g,b are the three colour planes and ptr is the output pointer, which the
   underlying store macro advances.  The macro name gives the byte order
   written for each pixel; the "a" byte of the 32-bit variants is always 0. */
#define out_abgr(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),b,g,r,ptr)
#define out_bgra(r,g,b,ptr)  vec_mstrgb32(typeof(r),b,g,r,((typeof (r))AVV(0)),ptr)
#define out_rgba(r,g,b,ptr)  vec_mstrgb32(typeof(r),r,g,b,((typeof (r))AVV(0)),ptr)
#define out_argb(r,g,b,ptr)  vec_mstrgb32(typeof(r),((typeof (r))AVV(0)),r,g,b,ptr)
#define out_rgb24(r,g,b,ptr) vec_mstrgb24(r,g,b,ptr)
#define out_bgr24(r,g,b,ptr) vec_mstbgr24(r,g,b,ptr)
445

    
446
DEFCSP420_CVT (yuv2_abgr, out_abgr)
447
#if 1
448
DEFCSP420_CVT (yuv2_bgra, out_bgra)
449
#else
450
static int altivec_yuv2_bgra32 (SwsContext *c,
451
                                unsigned char **in, int *instrides,
452
                                int srcSliceY,        int srcSliceH,
453
                                unsigned char **oplanes, int *outstrides)
454
{
455
    int w = c->srcW;
456
    int h = srcSliceH;
457
    int i,j;
458
    int instrides_scl[3];
459
    vector unsigned char y0,y1;
460

    
461
    vector signed char  u,v;
462

    
463
    vector signed short Y0,Y1,Y2,Y3;
464
    vector signed short U,V;
465
    vector signed short vx,ux,uvx;
466
    vector signed short vx0,ux0,uvx0;
467
    vector signed short vx1,ux1,uvx1;
468
    vector signed short R0,G0,B0;
469
    vector signed short R1,G1,B1;
470
    vector unsigned char R,G,B;
471

    
472
    vector unsigned char *uivP, *vivP;
473
    vector unsigned char align_perm;
474

    
475
    vector signed short
476
        lCY  = c->CY,
477
        lOY  = c->OY,
478
        lCRV = c->CRV,
479
        lCBU = c->CBU,
480
        lCGU = c->CGU,
481
        lCGV = c->CGV;
482

    
483
    vector unsigned short lCSHIFT = c->CSHIFT;
484

    
485
    ubyte *y1i   = in[0];
486
    ubyte *y2i   = in[0]+w;
487
    ubyte *ui    = in[1];
488
    ubyte *vi    = in[2];
489

    
490
    vector unsigned char *oute
491
        = (vector unsigned char *)
492
          (oplanes[0]+srcSliceY*outstrides[0]);
493
    vector unsigned char *outo
494
        = (vector unsigned char *)
495
          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
496

    
497

    
498
    instrides_scl[0] = instrides[0];
499
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
500
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
501

    
502

    
503
    for (i=0;i<h/2;i++) {
504
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
505
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
506

    
507
        for (j=0;j<w/16;j++) {
508

    
509
            y0 = vec_ldl (0,y1i);
510
            y1 = vec_ldl (0,y2i);
511
            uivP = (vector unsigned char *)ui;
512
            vivP = (vector unsigned char *)vi;
513

    
514
            align_perm = vec_lvsl (0, ui);
515
            u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
516

    
517
            align_perm = vec_lvsl (0, vi);
518
            v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
519
            u  = (vector signed char)
520
                 vec_sub (u,(vector signed char)
521
                          vec_splat((vector signed char)AVV(128),0));
522

    
523
            v  = (vector signed char)
524
                 vec_sub (v, (vector signed char)
525
                          vec_splat((vector signed char)AVV(128),0));
526

    
527
            U  = vec_unpackh (u);
528
            V  = vec_unpackh (v);
529

    
530

    
531
            Y0 = vec_unh (y0);
532
            Y1 = vec_unl (y0);
533
            Y2 = vec_unh (y1);
534
            Y3 = vec_unl (y1);
535

    
536
            Y0 = vec_mradds (Y0, lCY, lOY);
537
            Y1 = vec_mradds (Y1, lCY, lOY);
538
            Y2 = vec_mradds (Y2, lCY, lOY);
539
            Y3 = vec_mradds (Y3, lCY, lOY);
540

    
541
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
542
            ux = vec_sl (U, lCSHIFT);
543
            ux = vec_mradds (ux, lCBU, (vector signed short)AVV(0));
544
            ux0  = vec_mergeh (ux,ux);
545
            ux1  = vec_mergel (ux,ux);
546

    
547
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
548
            vx = vec_sl (V, lCSHIFT);
549
            vx = vec_mradds (vx, lCRV, (vector signed short)AVV(0));
550
            vx0  = vec_mergeh (vx,vx);
551
            vx1  = vec_mergel (vx,vx);
552
            /* uvx = ((CGU*u) + (CGV*v))>>15 */
553
            uvx = vec_mradds (U, lCGU, (vector signed short)AVV(0));
554
            uvx = vec_mradds (V, lCGV, uvx);
555
            uvx0 = vec_mergeh (uvx,uvx);
556
            uvx1 = vec_mergel (uvx,uvx);
557
            R0 = vec_add (Y0,vx0);
558
            G0 = vec_add (Y0,uvx0);
559
            B0 = vec_add (Y0,ux0);
560
            R1 = vec_add (Y1,vx1);
561
            G1 = vec_add (Y1,uvx1);
562
            B1 = vec_add (Y1,ux1);
563
            R  = vec_packclp (R0,R1);
564
            G  = vec_packclp (G0,G1);
565
            B  = vec_packclp (B0,B1);
566

    
567
            out_argb(R,G,B,oute);
568
            R0 = vec_add (Y2,vx0);
569
            G0 = vec_add (Y2,uvx0);
570
            B0 = vec_add (Y2,ux0);
571
            R1 = vec_add (Y3,vx1);
572
            G1 = vec_add (Y3,uvx1);
573
            B1 = vec_add (Y3,ux1);
574
            R  = vec_packclp (R0,R1);
575
            G  = vec_packclp (G0,G1);
576
            B  = vec_packclp (B0,B1);
577

    
578
            out_argb(R,G,B,outo);
579
            y1i  += 16;
580
            y2i  += 16;
581
            ui   += 8;
582
            vi   += 8;
583

    
584
        }
585

    
586
        outo  += (outstrides[0])>>4;
587
        oute  += (outstrides[0])>>4;
588

    
589
        ui    += instrides_scl[1];
590
        vi    += instrides_scl[2];
591
        y1i   += instrides_scl[0];
592
        y2i   += instrides_scl[0];
593
    }
594
    return srcSliceH;
595
}
596

    
597
#endif
598

    
599

    
600
DEFCSP420_CVT (yuv2_rgba, out_rgba)
601
DEFCSP420_CVT (yuv2_argb, out_argb)
602
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
603
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
604

    
605

    
606
// uyvy|uyvy|uyvy|uyvy
607
// 0123 4567 89ab cdef
608
static
609
const vector unsigned char
610
    demux_u = (const vector unsigned char)AVV(0x10,0x00,0x10,0x00,
611
                                              0x10,0x04,0x10,0x04,
612
                                              0x10,0x08,0x10,0x08,
613
                                              0x10,0x0c,0x10,0x0c),
614
    demux_v = (const vector unsigned char)AVV(0x10,0x02,0x10,0x02,
615
                                              0x10,0x06,0x10,0x06,
616
                                              0x10,0x0A,0x10,0x0A,
617
                                              0x10,0x0E,0x10,0x0E),
618
    demux_y = (const vector unsigned char)AVV(0x10,0x01,0x10,0x03,
619
                                              0x10,0x05,0x10,0x07,
620
                                              0x10,0x09,0x10,0x0B,
621
                                              0x10,0x0D,0x10,0x0F);
622

    
623
/*
624
  this is so I can play live CCIR raw video
625
*/
626
static int altivec_uyvy_rgb32 (SwsContext *c,
627
                               unsigned char **in, int *instrides,
628
                               int srcSliceY,        int srcSliceH,
629
                               unsigned char **oplanes, int *outstrides)
630
{
631
    int w = c->srcW;
632
    int h = srcSliceH;
633
    int i,j;
634
    vector unsigned char uyvy;
635
    vector signed   short Y,U,V;
636
    vector signed   short R0,G0,B0,R1,G1,B1;
637
    vector unsigned char  R,G,B;
638
    vector unsigned char *out;
639
    ubyte *img;
640

    
641
    img = in[0];
642
    out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
643

    
644
    for (i=0;i<h;i++) {
645
        for (j=0;j<w/16;j++) {
646
            uyvy = vec_ld (0, img);
647
            U = (vector signed short)
648
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
649

    
650
            V = (vector signed short)
651
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
652

    
653
            Y = (vector signed short)
654
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
655

    
656
            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
657

    
658
            uyvy = vec_ld (16, img);
659
            U = (vector signed short)
660
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_u);
661

    
662
            V = (vector signed short)
663
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_v);
664

    
665
            Y = (vector signed short)
666
                vec_perm (uyvy, (vector unsigned char)AVV(0), demux_y);
667

    
668
            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
669

    
670
            R  = vec_packclp (R0,R1);
671
            G  = vec_packclp (G0,G1);
672
            B  = vec_packclp (B0,B1);
673

    
674
            //      vec_mstbgr24 (R,G,B, out);
675
            out_rgba (R,G,B,out);
676

    
677
            img += 32;
678
        }
679
    }
680
    return srcSliceH;
681
}
682

    
683

    
684

    
685
/* Ok currently the acceleration routine only supports
686
   inputs of widths a multiple of 16
687
   and heights a multiple 2
688

689
   So we just fall back to the C codes for this.
690
*/
691
SwsFunc yuv2rgb_init_altivec (SwsContext *c)
692
{
693
    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
694
        return NULL;
695

    
696
    /*
697
      and this seems not to matter too much I tried a bunch of
698
      videos with abnormal widths and MPlayer crashes elsewhere.
699
      mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
700
      boom with X11 bad match.
701

702
    */
703
    if ((c->srcW & 0xf) != 0)    return NULL;
704

    
705
    switch (c->srcFormat) {
706
    case PIX_FMT_YUV410P:
707
    case PIX_FMT_YUV420P:
708
    /*case IMGFMT_CLPL:        ??? */
709
    case PIX_FMT_GRAY8:
710
    case PIX_FMT_NV12:
711
    case PIX_FMT_NV21:
712
        if ((c->srcH & 0x1) != 0)
713
            return NULL;
714

    
715
        switch(c->dstFormat){
716
        case PIX_FMT_RGB24:
717
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
718
            return altivec_yuv2_rgb24;
719
        case PIX_FMT_BGR24:
720
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
721
            return altivec_yuv2_bgr24;
722
        case PIX_FMT_ARGB:
723
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
724
            return altivec_yuv2_argb;
725
        case PIX_FMT_ABGR:
726
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
727
            return altivec_yuv2_abgr;
728
        case PIX_FMT_RGBA:
729
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
730
            return altivec_yuv2_rgba;
731
        case PIX_FMT_BGRA:
732
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
733
            return altivec_yuv2_bgra;
734
        default: return NULL;
735
        }
736
        break;
737

    
738
    case PIX_FMT_UYVY422:
739
        switch(c->dstFormat){
740
        case PIX_FMT_BGR32:
741
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
742
            return altivec_uyvy_rgb32;
743
        default: return NULL;
744
        }
745
        break;
746

    
747
    }
748
    return NULL;
749
}
750

    
751
/*
 * Fill the AltiVec coefficient vectors of 'c' (CSHIFT, CY, OY, CRV, CBU,
 * CGU, CGV) from the colourspace table and the picture adjustments.
 *
 * inv_table   four coefficients of the selected colourspace; entries 0..3
 *             feed crv, cbu, cgu and cgv respectively
 * brightness, contrast, saturation
 *             adjustment values; contrast and saturation are used shifted
 *             right by 16 for the chroma coefficients
 *
 * Each scalar is truncated to a signed short, placed in an aligned array
 * that overlays a vector, and then broadcast to all eight lanes.
 */
void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
{
    union {
        signed short tmp[8] __attribute__ ((aligned(16)));
        vector signed short vec;
    } buf;
    vector signed short coeffs;
    const int con = contrast   >> 16;
    const int sat = saturation >> 16;

    /* Slots 6 and 7 are left unset; they are never broadcast below. */
    buf.tmp[0] = ((0xffffLL * contrast) >> 8) >> 9;         //cy
    buf.tmp[1] = -256 * brightness;                         //oy
    buf.tmp[2] = (inv_table[0] >> 3) * con * sat;           //crv
    buf.tmp[3] = (inv_table[1] >> 3) * con * sat;           //cbu
    buf.tmp[4] = -((inv_table[2] >> 1) * con * sat);        //cgu
    buf.tmp[5] = -((inv_table[3] >> 1) * con * sat);        //cgv

    coeffs = buf.vec;

    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
    c->CY     = vec_splat (coeffs, 0);
    c->OY     = vec_splat (coeffs, 1);
    c->CRV    = vec_splat (coeffs, 2);
    c->CBU    = vec_splat (coeffs, 3);
    c->CGU    = vec_splat (coeffs, 4);
    c->CGV    = vec_splat (coeffs, 5);
}
784

    
785

    
786
void
787
altivec_yuv2packedX (SwsContext *c,
788
                     int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
789
                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
790
                     uint8_t *dest, int dstW, int dstY)
791
{
792
    int i,j;
793
    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
794
    vector signed short R0,G0,B0,R1,G1,B1;
795

    
796
    vector unsigned char R,G,B;
797
    vector unsigned char *out,*nout;
798

    
799
    vector signed short   RND = vec_splat_s16(1<<3);
800
    vector unsigned short SCL = vec_splat_u16(4);
801
    unsigned long scratch[16] __attribute__ ((aligned (16)));
802

    
803
    vector signed short *YCoeffs, *CCoeffs;
804

    
805
    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
806
    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
807

    
808
    out = (vector unsigned char *)dest;
809

    
810
    for (i=0; i<dstW; i+=16){
811
        Y0 = RND;
812
        Y1 = RND;
813
        /* extract 16 coeffs from lumSrc */
814
        for (j=0; j<lumFilterSize; j++) {
815
            X0 = vec_ld (0,  &lumSrc[j][i]);
816
            X1 = vec_ld (16, &lumSrc[j][i]);
817
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
818
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
819
        }
820

    
821
        U = RND;
822
        V = RND;
823
        /* extract 8 coeffs from U,V */
824
        for (j=0; j<chrFilterSize; j++) {
825
            X  = vec_ld (0, &chrSrc[j][i/2]);
826
            U  = vec_mradds (X, CCoeffs[j], U);
827
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
828
            V  = vec_mradds (X, CCoeffs[j], V);
829
        }
830

    
831
        /* scale and clip signals */
832
        Y0 = vec_sra (Y0, SCL);
833
        Y1 = vec_sra (Y1, SCL);
834
        U  = vec_sra (U,  SCL);
835
        V  = vec_sra (V,  SCL);
836

    
837
        Y0 = vec_clip_s16 (Y0);
838
        Y1 = vec_clip_s16 (Y1);
839
        U  = vec_clip_s16 (U);
840
        V  = vec_clip_s16 (V);
841

    
842
        /* now we have
843
          Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
844
          U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
845

846
          Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
847
          U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
848
          V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
849
        */
850

    
851
        U0 = vec_mergeh (U,U);
852
        V0 = vec_mergeh (V,V);
853

    
854
        U1 = vec_mergel (U,U);
855
        V1 = vec_mergel (V,V);
856

    
857
        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
858
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
859

    
860
        R  = vec_packclp (R0,R1);
861
        G  = vec_packclp (G0,G1);
862
        B  = vec_packclp (B0,B1);
863

    
864
        switch(c->dstFormat) {
865
            case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
866
            case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
867
            case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
868
            case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
869
            case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
870
            case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
871
            default:
872
            {
873
                /* If this is reached, the caller should have called yuv2packedXinC
874
                   instead. */
875
                static int printed_error_message;
876
                if (!printed_error_message) {
877
                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
878
                           sws_format_name(c->dstFormat));
879
                    printed_error_message=1;
880
                }
881
                return;
882
            }
883
        }
884
    }
885

    
886
    if (i < dstW) {
887
        i -= 16;
888

    
889
        Y0 = RND;
890
        Y1 = RND;
891
        /* extract 16 coeffs from lumSrc */
892
        for (j=0; j<lumFilterSize; j++) {
893
            X0 = vec_ld (0,  &lumSrc[j][i]);
894
            X1 = vec_ld (16, &lumSrc[j][i]);
895
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
896
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
897
        }
898

    
899
        U = RND;
900
        V = RND;
901
        /* extract 8 coeffs from U,V */
902
        for (j=0; j<chrFilterSize; j++) {
903
            X  = vec_ld (0, &chrSrc[j][i/2]);
904
            U  = vec_mradds (X, CCoeffs[j], U);
905
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
906
            V  = vec_mradds (X, CCoeffs[j], V);
907
        }
908

    
909
        /* scale and clip signals */
910
        Y0 = vec_sra (Y0, SCL);
911
        Y1 = vec_sra (Y1, SCL);
912
        U  = vec_sra (U,  SCL);
913
        V  = vec_sra (V,  SCL);
914

    
915
        Y0 = vec_clip_s16 (Y0);
916
        Y1 = vec_clip_s16 (Y1);
917
        U  = vec_clip_s16 (U);
918
        V  = vec_clip_s16 (V);
919

    
920
        /* now we have
921
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
922
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
923

924
           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
925
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
926
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
927
        */
928

    
929
        U0 = vec_mergeh (U,U);
930
        V0 = vec_mergeh (V,V);
931

    
932
        U1 = vec_mergel (U,U);
933
        V1 = vec_mergel (V,V);
934

    
935
        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
936
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
937

    
938
        R  = vec_packclp (R0,R1);
939
        G  = vec_packclp (G0,G1);
940
        B  = vec_packclp (B0,B1);
941

    
942
        nout = (vector unsigned char *)scratch;
943
        switch(c->dstFormat) {
944
            case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
945
            case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
946
            case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
947
            case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
948
            case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
949
            case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
950
            default:
951
                /* Unreachable, I think. */
952
                av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
953
                       sws_format_name(c->dstFormat));
954
                return;
955
        }
956

    
957
        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
958
    }
959

    
960
}