Statistics
| Branch: | Revision:

ffmpeg / libswscale / ppc / yuv2rgb_altivec.c @ 6cce7cab

History | View | Annotate | Download (37.5 KB)

1
/*
2
 * AltiVec acceleration for colorspace conversion
3
 *
4
 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 */
22

    
23
/*
24
Convert I420 YV12 to RGB in various formats,
25
  it rejects images that are not in 420 formats,
26
  it rejects images that don't have widths of multiples of 16,
27
  it rejects images that don't have heights of multiples of 2.
28
Reject defers to C simulation code.
29

30
Lots of optimizations to be done here.
31

32
1. Need to fix saturation code. I just couldn't get it to fly with packs
33
   and adds, so we currently use max/min to clip.
34

35
2. The inefficient use of chroma loading needs a bit of brushing up.
36

37
3. Analysis of pipeline stalls needs to be done. Use shark to identify
38
   pipeline stalls.
39

40

41
MODIFIED to calculate coeffs from currently selected color space.
42
MODIFIED core to be a macro where you specify the output format.
43
ADDED UYVY conversion which is never called due to some thing in swscale.
44
CORRECTED algorithm selection to be strict on input formats.
45
ADDED runtime detection of AltiVec.
46

47
ADDED altivec_yuv2packedX vertical scl + RGB converter
48

49
March 27,2004
50
PERFORMANCE ANALYSIS
51

52
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53
used as test.
54
The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55
same sequence.
56

57
720 * 480 * 30  ~10MPS
58

59
so we have roughly 10 clocks per pixel. This is too high, something has
60
to be wrong.
61

62
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63
need for vec_min.
64

65
OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66
the input video frame, it was just decompressed so it probably resides in L1
67
caches. However, we are creating the output video stream. This needs to use the
68
DSTST instruction to optimize for the cache. We couple this with the fact that
69
we are not going to be visiting the input buffer again so we mark it Least
70
Recently Used. This shaves 25% of the processor cycles off.
71

72
Now memcpy is the largest mips consumer in the system, probably due
73
to the inefficient X11 stuff.
74

75
GL libraries seem to be very slow on this machine 1.33Ghz PB running
76
Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
77
a versioning issue, however I have libGL.1.2.dylib for both
78
machines. (We need to figure this out now.)
79

80
GL2 libraries work now with patch for RGB32.
81

82
NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83

84
Integrated luma prescaling adjustment for saturation/contrast/brightness
85
adjustment.
86
*/
87

    
88
#include <stdio.h>
89
#include <stdlib.h>
90
#include <string.h>
91
#include <inttypes.h>
92
#include <assert.h>
93
#include "config.h"
94
#include "libswscale/rgb2rgb.h"
95
#include "libswscale/swscale.h"
96
#include "libswscale/swscale_internal.h"
97

    
98
#undef PROFILE_THE_BEAST
99
#undef INC_SCALING
100

    
101
typedef unsigned char ubyte;
102
typedef signed char   sbyte;
103

    
104

    
105
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
106
   homogeneous vector registers x0,x1,x2 are interleaved with the
107
   following technique:
108

109
      o0 = vec_mergeh (x0,x1);
110
      o1 = vec_perm (o0, x2, perm_rgb_0);
111
      o2 = vec_perm (o0, x2, perm_rgb_1);
112
      o3 = vec_mergel (x0,x1);
113
      o4 = vec_perm (o3,o2,perm_rgb_2);
114
      o5 = vec_perm (o3,o2,perm_rgb_3);
115

116
  perm_rgb_0:   o0(RG).h v1(B) --> o1*
117
              0   1  2   3   4
118
             rgbr|gbrg|brgb|rgbr
119
             0010 0100 1001 0010
120
             0102 3145 2673 894A
121

122
  perm_rgb_1:   o0(RG).h v1(B) --> o2
123
              0   1  2   3   4
124
             gbrg|brgb|bbbb|bbbb
125
             0100 1001 1111 1111
126
             B5CD 6EF7 89AB CDEF
127

128
  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
129
              0   1  2   3   4
130
             gbrg|brgb|rgbr|gbrg
131
             1111 1111 0010 0100
132
             89AB CDEF 0182 3945
133

134
  perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
135
              0   1  2   3   4
136
             brgb|rgbr|gbrg|brgb
137
             1001 0010 0100 1001
138
             a67b 89cA BdCD eEFf
139

140
*/
141
static
142
const vector unsigned char
143
  perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
144
                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
145
  perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
146
                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
147
  perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
148
                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
149
  perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
150
                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
151

    
152
/*
 * Interleave three 16-byte planes into 48 bytes of packed 3-byte pixels.
 * Within each pixel the THIRD argument supplies byte 0, the second argument
 * byte 1 and the first argument byte 2.  The packed data is returned in
 * out0, out1, out2 (in memory order).
 */
#define vec_merge3(last,mid,first,out0,out1,out2)                     \
do {                                                                  \
    __typeof__(first) m3_pair_hi, m3_carry, m3_pair_lo;               \
    /* (first,mid) byte pairs for pixels 0..7 */                      \
    m3_pair_hi = vec_mergeh (first, mid);                             \
    out0       = vec_perm (m3_pair_hi, last, perm_rgb_0);             \
    /* leftovers of the high pairs plus bytes 8..15 of 'last' */      \
    m3_carry   = vec_perm (m3_pair_hi, last, perm_rgb_1);             \
    /* (first,mid) byte pairs for pixels 8..15 */                     \
    m3_pair_lo = vec_mergel (first, mid);                             \
    out1       = vec_perm (m3_pair_lo, m3_carry, perm_rgb_2);         \
    out2       = vec_perm (m3_pair_lo, m3_carry, perm_rgb_3);         \
} while(0)
162

    
163
/*
 * Pack 16 pixels of three bytes each and store them at ptr, with x2 as the
 * first byte of every pixel in memory, then x1, then x0.  ptr is advanced
 * by three elements.
 */
#define vec_mstbgr24(x0,x1,x2,ptr)                       \
do {                                                     \
    __typeof__(x0) bgr_v0, bgr_v1, bgr_v2;               \
    vec_merge3 (x0, x1, x2, bgr_v0, bgr_v1, bgr_v2);     \
    vec_st (bgr_v0, 0, (ptr) + 0);                       \
    vec_st (bgr_v1, 0, (ptr) + 1);                       \
    vec_st (bgr_v2, 0, (ptr) + 2);                       \
    (ptr) += 3;                                          \
}  while (0)
171

    
172
/*
 * Pack 16 pixels of three bytes each and store them at ptr, with x0 as the
 * first byte of every pixel in memory, then x1, then x2.  ptr is advanced
 * by three elements.
 */
#define vec_mstrgb24(x0,x1,x2,ptr)                       \
do {                                                     \
    __typeof__(x0) rgb_v0, rgb_v1, rgb_v2;               \
    vec_merge3 (x2, x1, x0, rgb_v0, rgb_v1, rgb_v2);     \
    vec_st (rgb_v0, 0, (ptr) + 0);                       \
    vec_st (rgb_v1, 0, (ptr) + 1);                       \
    vec_st (rgb_v2, 0, (ptr) + 2);                       \
    (ptr) += 3;                                          \
}  while (0)
180

    
181
/*
 * Store 16 pixels of four bytes each (64 bytes in total).
 * In memory every pixel is laid out as x0, x1, x2, x3, i.e. x0 is the first
 * byte of the pixel and x3 the last.  T is the vector type used for the
 * stores; ptr is advanced by four elements afterwards.
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                   \
do {                                                                      \
    vector unsigned short rgb32_ab, rgb32_cd;                             \
    /* pixels 0..7: (x0,x1) and (x2,x3) byte pairs as halfwords */        \
    rgb32_ab = (vector unsigned short) vec_mergeh (x0, x1);               \
    rgb32_cd = (vector unsigned short) vec_mergeh (x2, x3);               \
    vec_st ((T) vec_mergeh (rgb32_ab, rgb32_cd), 0*16, (T *)(ptr));       \
    vec_st ((T) vec_mergel (rgb32_ab, rgb32_cd), 1*16, (T *)(ptr));       \
    /* pixels 8..15 */                                                    \
    rgb32_ab = (vector unsigned short) vec_mergel (x0, x1);               \
    rgb32_cd = (vector unsigned short) vec_mergel (x2, x3);               \
    vec_st ((T) vec_mergeh (rgb32_ab, rgb32_cd), 2*16, (T *)(ptr));       \
    vec_st ((T) vec_mergel (rgb32_ab, rgb32_cd), 3*16, (T *)(ptr));       \
    (ptr) += 4;                                                           \
}  while (0)
202

    
203
/*
204

205
  | 1     0       1.4021   | | Y |
206
  | 1    -0.3441 -0.7142   |x| Cb|
207
  | 1     1.7718  0        | | Cr|
208

209

210
  Y:      [-128 127]
211
  Cb/Cr : [-128 127]
212

213
  typical yuv conversion work on Y: 0-255 this version has been optimized for jpeg decode.
214

215
*/
216

    
217

    
218

    
219

    
220
/*
 * Widen bytes to 16-bit lanes by pairing every source byte with a leading
 * zero byte (zero-extension in big-endian element layout).
 * vec_unh handles bytes 0..7 of x, vec_unl bytes 8..15.
 * Merging a zero vector with x yields the byte sequence 0,x[k],0,x[k+1],...
 */
#define vec_unh(x) \
    ((vector signed short) vec_mergeh (((__typeof__(x)){0}), (x)))
#define vec_unl(x) \
    ((vector signed short) vec_mergel (((__typeof__(x)){0}), (x)))
230

    
231
/* Clamp every signed 16-bit element of x to the range [16, 235]. */
#define vec_clip_s16(x) \
    vec_max (((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}), \
             vec_min (((vector signed short){235,235,235,235,235,235,235,235}), (x)))
234

    
235
/*
 * Pack two vectors of signed 16-bit values into one vector of unsigned
 * bytes, clamping every element to [0, 255].
 * vec_packsu saturates signed halfwords to unsigned bytes directly
 * (negative -> 0, above 255 -> 255), so no separate vec_max against zero is
 * needed before packing.  x and y must be 'vector signed short'.
 */
#define vec_packclp(x,y) \
    ((vector unsigned char) vec_packsu ((x), (y)))
239

    
240
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){255}),a,a,a,ptr)
241

    
242

    
243
/**
 * Convert eight pixels from Y/U/V to R/G/B with the coefficient vectors
 * held in the scaler context.
 *
 * @param c        context supplying CY, OY, CSHIFT, CBU, CRV, CGU and CGV
 * @param Y,U,V    input samples as signed 16-bit lanes; 128 is subtracted
 *                 from U and V before use
 * @param R,G,B    receive the unclipped 16-bit results
 *
 * Every product goes through vec_mradds (multiply, round, add, saturate).
 */
static inline void cvtyuvtoRGB (SwsContext *c,
                                vector signed short Y, vector signed short U, vector signed short V,
                                vector signed short *R, vector signed short *G, vector signed short *B)
{
    const vector signed short chroma_bias =
        (vector signed short) vec_splat ((vector signed short){128}, 0);
    vector signed short luma, cb, cr, cb_shifted, cr_shifted, green_part;

    /* scaled and offset luma, shared by all three channels */
    luma = vec_mradds (Y, c->CY, c->OY);

    /* centre chroma around zero */
    cb = vec_sub (U, chroma_bias);
    cr = vec_sub (V, chroma_bias);

    /* B = luma + CBU * (cb << CSHIFT) */
    cb_shifted = vec_sl (cb, c->CSHIFT);
    *B = vec_mradds (cb_shifted, c->CBU, luma);

    /* R = luma + CRV * (cr << CSHIFT) */
    cr_shifted = vec_sl (cr, c->CSHIFT);
    *R = vec_mradds (cr_shifted, c->CRV, luma);

    /* G = luma + CGU * cb + CGV * cr */
    green_part = vec_mradds (cb, c->CGU, luma);
    *G = vec_mradds (cr, c->CGV, green_part);
}
267

    
268

    
269
/*
270
  ------------------------------------------------------------------------------
271
  CS converters
272
  ------------------------------------------------------------------------------
273
*/
274

    
275

    
276
/*
 * DEFCSP420_CVT(name, out_pixels) defines
 *
 *     static int altivec_<name>(SwsContext *c,
 *                               const unsigned char **in, int *instrides,
 *                               int srcSliceY, int srcSliceH,
 *                               unsigned char **oplanes, int *outstrides)
 *
 * which converts a slice of planar Y/U/V input (in[0], in[1], in[2]) to
 * packed pixels in oplanes[0], starting at output row srcSliceY, and returns
 * srcSliceH.  out_pixels(R,G,B,ptr) must store 16 pixels at ptr and advance
 * ptr past them.
 *
 * Two luma rows are converted per outer iteration and share one chroma row;
 * every inner iteration converts 16 pixels of both rows (16 Y bytes per row,
 * 8 U bytes, 8 V bytes).  Only h/2 row pairs and w/16 pixel groups are
 * processed.
 *
 * Input rows may be unaligned: each load reads two consecutive vectors and
 * realigns them with vec_lvsl/vec_perm, so up to 16 bytes beyond the data
 * actually used are touched.  Output is written with aligned vector stores.
 * NOTE(review): this presumes oplanes[0] and outstrides[0] are 16-byte
 * aligned -- confirm against the code that selects these converters.
 *
 * The start of every output row is derived from the row index and
 * outstrides[0], so destination strides larger than the packed row width
 * are handled.
 */
#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           const unsigned char **in, int *instrides,    \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    /* the planes are read-only: keep the const qualifier */            \
    const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;            \
    vector unsigned char align_perm;                                    \
                                                                        \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    /* loop-invariant constants */                                      \
    const vector signed short csp_zero = (vector signed short){0};      \
    /* -128 has the same bit pattern as 128 in a signed byte; the */    \
    /* modular subtract therefore recentres chroma around zero    */    \
    const vector signed char csp_bias =                                 \
        (vector signed char) vec_splat ((vector signed char){-128}, 0); \
                                                                        \
    const ubyte *y1i   = in[0];                                         \
    const ubyte *y2i   = in[0]+instrides[0];                            \
    const ubyte *ui    = in[1];                                         \
    const ubyte *vi    = in[2];                                         \
                                                                        \
    /* first output row of this slice */                                \
    unsigned char *csp_dst = oplanes[0]+srcSliceY*outstrides[0];        \
    vector unsigned char *oute, *outo;                                  \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        /* even and odd output row of this pair */                      \
        oute = (vector unsigned char *)                                 \
                   (csp_dst + (2*i)  *outstrides[0]);                   \
        outo = (vector unsigned char *)                                 \
                   (csp_dst + (2*i+1)*outstrides[0]);                   \
                                                                        \
        /* prefetch-for-store hints for the two output rows */          \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            y1ivP = (const vector unsigned char *)y1i;                  \
            y2ivP = (const vector unsigned char *)y2i;                  \
            uivP  = (const vector unsigned char *)ui;                   \
            vivP  = (const vector unsigned char *)vi;                   \
                                                                        \
            /* unaligned loads: two vectors realigned by vec_perm */    \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (uivP[0], uivP[1], align_perm);                \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vivP[0], vivP[1], align_perm);                \
                                                                        \
            u  = (vector signed char) vec_sub (u, csp_bias);            \
            v  = (vector signed char) vec_sub (v, csp_bias);            \
                                                                        \
            /* only the first 8 chroma samples are used per group */    \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, csp_zero);                       \
            /* duplicate each chroma term for two adjacent pixels */    \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, csp_zero);                       \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, csp_zero);                       \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            /* even row */                                              \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            /* odd row, same chroma */                                  \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}
439

    
440

    
441
/*
 * Pixel store macros handed to DEFCSP420_CVT.  a, b and c are the R, G and B
 * byte vectors; the macro name gives the byte order of each pixel in memory.
 *
 * The alpha byte must be 255 for all 16 pixels.  A braced vector literal
 * {255} only sets element 0 and zero-fills the rest, so the value is
 * splatted across the whole vector.
 */
#define out_opaque(a) \
    ((__typeof__ (a)) vec_splat (((__typeof__ (a)){255}), 0))

#define out_abgr(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),out_opaque(a),c,b,a,ptr)
#define out_bgra(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),c,b,a,out_opaque(a),ptr)
#define out_rgba(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),a,b,c,out_opaque(a),ptr)
#define out_argb(a,b,c,ptr)  vec_mstrgb32(__typeof__(a),out_opaque(a),a,b,c,ptr)
#define out_rgb24(a,b,c,ptr) vec_mstrgb24(a,b,c,ptr)
#define out_bgr24(a,b,c,ptr) vec_mstbgr24(a,b,c,ptr)
447

    
448
DEFCSP420_CVT (yuv2_abgr, out_abgr)
449
#if 1
450
DEFCSP420_CVT (yuv2_bgra, out_bgra)
451
#else
452
static int altivec_yuv2_bgra32 (SwsContext *c,
453
                                unsigned char **in, int *instrides,
454
                                int srcSliceY,        int srcSliceH,
455
                                unsigned char **oplanes, int *outstrides)
456
{
457
    int w = c->srcW;
458
    int h = srcSliceH;
459
    int i,j;
460
    int instrides_scl[3];
461
    vector unsigned char y0,y1;
462

    
463
    vector signed char  u,v;
464

    
465
    vector signed short Y0,Y1,Y2,Y3;
466
    vector signed short U,V;
467
    vector signed short vx,ux,uvx;
468
    vector signed short vx0,ux0,uvx0;
469
    vector signed short vx1,ux1,uvx1;
470
    vector signed short R0,G0,B0;
471
    vector signed short R1,G1,B1;
472
    vector unsigned char R,G,B;
473

    
474
    vector unsigned char *uivP, *vivP;
475
    vector unsigned char align_perm;
476

    
477
    vector signed short
478
        lCY  = c->CY,
479
        lOY  = c->OY,
480
        lCRV = c->CRV,
481
        lCBU = c->CBU,
482
        lCGU = c->CGU,
483
        lCGV = c->CGV;
484

    
485
    vector unsigned short lCSHIFT = c->CSHIFT;
486

    
487
    ubyte *y1i   = in[0];
488
    ubyte *y2i   = in[0]+w;
489
    ubyte *ui    = in[1];
490
    ubyte *vi    = in[2];
491

    
492
    vector unsigned char *oute
493
        = (vector unsigned char *)
494
          (oplanes[0]+srcSliceY*outstrides[0]);
495
    vector unsigned char *outo
496
        = (vector unsigned char *)
497
          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
498

    
499

    
500
    instrides_scl[0] = instrides[0];
501
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
502
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
503

    
504

    
505
    for (i=0;i<h/2;i++) {
506
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
507
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
508

    
509
        for (j=0;j<w/16;j++) {
510

    
511
            y0 = vec_ldl (0,y1i);
512
            y1 = vec_ldl (0,y2i);
513
            uivP = (vector unsigned char *)ui;
514
            vivP = (vector unsigned char *)vi;
515

    
516
            align_perm = vec_lvsl (0, ui);
517
            u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
518

    
519
            align_perm = vec_lvsl (0, vi);
520
            v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
521
            u  = (vector signed char)
522
                 vec_sub (u,(vector signed char)
523
                          vec_splat((vector signed char){128},0));
524

    
525
            v  = (vector signed char)
526
                 vec_sub (v, (vector signed char)
527
                          vec_splat((vector signed char){128},0));
528

    
529
            U  = vec_unpackh (u);
530
            V  = vec_unpackh (v);
531

    
532

    
533
            Y0 = vec_unh (y0);
534
            Y1 = vec_unl (y0);
535
            Y2 = vec_unh (y1);
536
            Y3 = vec_unl (y1);
537

    
538
            Y0 = vec_mradds (Y0, lCY, lOY);
539
            Y1 = vec_mradds (Y1, lCY, lOY);
540
            Y2 = vec_mradds (Y2, lCY, lOY);
541
            Y3 = vec_mradds (Y3, lCY, lOY);
542

    
543
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
544
            ux = vec_sl (U, lCSHIFT);
545
            ux = vec_mradds (ux, lCBU, (vector signed short){0});
546
            ux0  = vec_mergeh (ux,ux);
547
            ux1  = vec_mergel (ux,ux);
548

    
549
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
550
            vx = vec_sl (V, lCSHIFT);
551
            vx = vec_mradds (vx, lCRV, (vector signed short){0});
552
            vx0  = vec_mergeh (vx,vx);
553
            vx1  = vec_mergel (vx,vx);
554
            /* uvx = ((CGU*u) + (CGV*v))>>15 */
555
            uvx = vec_mradds (U, lCGU, (vector signed short){0});
556
            uvx = vec_mradds (V, lCGV, uvx);
557
            uvx0 = vec_mergeh (uvx,uvx);
558
            uvx1 = vec_mergel (uvx,uvx);
559
            R0 = vec_add (Y0,vx0);
560
            G0 = vec_add (Y0,uvx0);
561
            B0 = vec_add (Y0,ux0);
562
            R1 = vec_add (Y1,vx1);
563
            G1 = vec_add (Y1,uvx1);
564
            B1 = vec_add (Y1,ux1);
565
            R  = vec_packclp (R0,R1);
566
            G  = vec_packclp (G0,G1);
567
            B  = vec_packclp (B0,B1);
568

    
569
            out_argb(R,G,B,oute);
570
            R0 = vec_add (Y2,vx0);
571
            G0 = vec_add (Y2,uvx0);
572
            B0 = vec_add (Y2,ux0);
573
            R1 = vec_add (Y3,vx1);
574
            G1 = vec_add (Y3,uvx1);
575
            B1 = vec_add (Y3,ux1);
576
            R  = vec_packclp (R0,R1);
577
            G  = vec_packclp (G0,G1);
578
            B  = vec_packclp (B0,B1);
579

    
580
            out_argb(R,G,B,outo);
581
            y1i  += 16;
582
            y2i  += 16;
583
            ui   += 8;
584
            vi   += 8;
585

    
586
        }
587

    
588
        outo  += (outstrides[0])>>4;
589
        oute  += (outstrides[0])>>4;
590

    
591
        ui    += instrides_scl[1];
592
        vi    += instrides_scl[2];
593
        y1i   += instrides_scl[0];
594
        y2i   += instrides_scl[0];
595
    }
596
    return srcSliceH;
597
}
598

    
599
#endif
600

    
601

    
602
DEFCSP420_CVT (yuv2_rgba, out_rgba)
603
DEFCSP420_CVT (yuv2_argb, out_argb)
604
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
605
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
606

    
607

    
608
// uyvy|uyvy|uyvy|uyvy
609
// 0123 4567 89ab cdef
610
static
611
const vector unsigned char
612
    demux_u = {0x10,0x00,0x10,0x00,
613
               0x10,0x04,0x10,0x04,
614
               0x10,0x08,0x10,0x08,
615
               0x10,0x0c,0x10,0x0c},
616
    demux_v = {0x10,0x02,0x10,0x02,
617
               0x10,0x06,0x10,0x06,
618
               0x10,0x0A,0x10,0x0A,
619
               0x10,0x0E,0x10,0x0E},
620
    demux_y = {0x10,0x01,0x10,0x03,
621
               0x10,0x05,0x10,0x07,
622
               0x10,0x09,0x10,0x0B,
623
               0x10,0x0D,0x10,0x0F};
624

    
625
/*
626
  this is so I can play live CCIR raw video
627
*/
628
static int altivec_uyvy_rgb32 (SwsContext *c,
629
                               const unsigned char **in, int *instrides,
630
                               int srcSliceY,        int srcSliceH,
631
                               unsigned char **oplanes, int *outstrides)
632
{
633
    int w = c->srcW;
634
    int h = srcSliceH;
635
    int i,j;
636
    vector unsigned char uyvy;
637
    vector signed   short Y,U,V;
638
    vector signed   short R0,G0,B0,R1,G1,B1;
639
    vector unsigned char  R,G,B;
640
    vector unsigned char *out;
641
    const ubyte *img;
642

    
643
    img = in[0];
644
    out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
645

    
646
    for (i=0;i<h;i++) {
647
        for (j=0;j<w/16;j++) {
648
            uyvy = vec_ld (0, img);
649
            U = (vector signed short)
650
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);
651

    
652
            V = (vector signed short)
653
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);
654

    
655
            Y = (vector signed short)
656
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);
657

    
658
            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
659

    
660
            uyvy = vec_ld (16, img);
661
            U = (vector signed short)
662
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);
663

    
664
            V = (vector signed short)
665
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);
666

    
667
            Y = (vector signed short)
668
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);
669

    
670
            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
671

    
672
            R  = vec_packclp (R0,R1);
673
            G  = vec_packclp (G0,G1);
674
            B  = vec_packclp (B0,B1);
675

    
676
            //      vec_mstbgr24 (R,G,B, out);
677
            out_rgba (R,G,B,out);
678

    
679
            img += 32;
680
        }
681
    }
682
    return srcSliceH;
683
}
684

    
685

    
686

    
687
/* Ok currently the acceleration routine only supports
688
   inputs of widths a multiple of 16
689
   and heights a multiple 2
690

691
   So we just fall back to the C codes for this.
692
*/
693
SwsFunc ff_yuv2rgb_init_altivec(SwsContext *c)
694
{
695
    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
696
        return NULL;
697

    
698
    /*
699
      and this seems not to matter too much I tried a bunch of
700
      videos with abnormal widths and MPlayer crashes elsewhere.
701
      mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
702
      boom with X11 bad match.
703

704
    */
705
    if ((c->srcW & 0xf) != 0)    return NULL;
706

    
707
    switch (c->srcFormat) {
708
    case PIX_FMT_YUV410P:
709
    case PIX_FMT_YUV420P:
710
    /*case IMGFMT_CLPL:        ??? */
711
    case PIX_FMT_GRAY8:
712
    case PIX_FMT_NV12:
713
    case PIX_FMT_NV21:
714
        if ((c->srcH & 0x1) != 0)
715
            return NULL;
716

    
717
        switch(c->dstFormat) {
718
        case PIX_FMT_RGB24:
719
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
720
            return altivec_yuv2_rgb24;
721
        case PIX_FMT_BGR24:
722
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
723
            return altivec_yuv2_bgr24;
724
        case PIX_FMT_ARGB:
725
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
726
            return altivec_yuv2_argb;
727
        case PIX_FMT_ABGR:
728
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
729
            return altivec_yuv2_abgr;
730
        case PIX_FMT_RGBA:
731
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
732
            return altivec_yuv2_rgba;
733
        case PIX_FMT_BGRA:
734
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
735
            return altivec_yuv2_bgra;
736
        default: return NULL;
737
        }
738
        break;
739

    
740
    case PIX_FMT_UYVY422:
741
        switch(c->dstFormat) {
742
        case PIX_FMT_BGR32:
743
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
744
            return altivec_uyvy_rgb32;
745
        default: return NULL;
746
        }
747
        break;
748

    
749
    }
750
    return NULL;
751
}
752

    
753
/*
 * Compute the colour-conversion coefficients for the AltiVec converters and
 * store them in the context, each one replicated across a whole vector.
 *
 *   c           context receiving CSHIFT, CY, OY, CRV, CBU, CGU and CGV
 *   inv_table   four coefficients of the selected colour space; entries
 *               0..3 feed crv, cbu, cgu and cgv respectively
 *   brightness, contrast, saturation
 *               picture adjustments; contrast and saturation are used
 *               after a right shift by 16 for the chroma terms
 */
void ff_yuv2rgb_init_tables_altivec(SwsContext *c, const int inv_table[4], int brightness, int contrast, int saturation)
{
    /* Aligned staging area so the scalars can be read back as one vector. */
    union {
        DECLARE_ALIGNED(16, signed short, tmp)[8];
        vector signed short vec;
    } coeffs;

    /* Factor shared by all four chroma coefficients. */
    const int chroma_gain = (contrast >> 16) * (saturation >> 16);

    coeffs.tmp[0] = (0xffffLL * contrast >> 8) >> 9;          /* cy  */
    coeffs.tmp[1] = -256 * brightness;                        /* oy  */
    coeffs.tmp[2] = (inv_table[0] >> 3) * chroma_gain;        /* crv */
    coeffs.tmp[3] = (inv_table[1] >> 3) * chroma_gain;        /* cbu */
    coeffs.tmp[4] = -((inv_table[2] >> 1) * chroma_gain);     /* cgu */
    coeffs.tmp[5] = -((inv_table[3] >> 1) * chroma_gain);     /* cgv */

    /* Elements 6 and 7 of the staging vector are never read. */
    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
    c->CY     = vec_splat(coeffs.vec, 0);
    c->OY     = vec_splat(coeffs.vec, 1);
    c->CRV    = vec_splat(coeffs.vec, 2);
    c->CBU    = vec_splat(coeffs.vec, 3);
    c->CGU    = vec_splat(coeffs.vec, 4);
    c->CGV    = vec_splat(coeffs.vec, 5);
}
777

    
778

    
779
void
780
ff_yuv2packedX_altivec(SwsContext *c,
781
                       const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
782
                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
783
                     uint8_t *dest, int dstW, int dstY)
784
{
785
    int i,j;
786
    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
787
    vector signed short R0,G0,B0,R1,G1,B1;
788

    
789
    vector unsigned char R,G,B;
790
    vector unsigned char *out,*nout;
791

    
792
    vector signed short   RND = vec_splat_s16(1<<3);
793
    vector unsigned short SCL = vec_splat_u16(4);
794
    DECLARE_ALIGNED(16, unsigned long, scratch)[16];
795

    
796
    vector signed short *YCoeffs, *CCoeffs;
797

    
798
    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
799
    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
800

    
801
    out = (vector unsigned char *)dest;
802

    
803
    for (i=0; i<dstW; i+=16) {
804
        Y0 = RND;
805
        Y1 = RND;
806
        /* extract 16 coeffs from lumSrc */
807
        for (j=0; j<lumFilterSize; j++) {
808
            X0 = vec_ld (0,  &lumSrc[j][i]);
809
            X1 = vec_ld (16, &lumSrc[j][i]);
810
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
811
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
812
        }
813

    
814
        U = RND;
815
        V = RND;
816
        /* extract 8 coeffs from U,V */
817
        for (j=0; j<chrFilterSize; j++) {
818
            X  = vec_ld (0, &chrSrc[j][i/2]);
819
            U  = vec_mradds (X, CCoeffs[j], U);
820
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
821
            V  = vec_mradds (X, CCoeffs[j], V);
822
        }
823

    
824
        /* scale and clip signals */
825
        Y0 = vec_sra (Y0, SCL);
826
        Y1 = vec_sra (Y1, SCL);
827
        U  = vec_sra (U,  SCL);
828
        V  = vec_sra (V,  SCL);
829

    
830
        Y0 = vec_clip_s16 (Y0);
831
        Y1 = vec_clip_s16 (Y1);
832
        U  = vec_clip_s16 (U);
833
        V  = vec_clip_s16 (V);
834

    
835
        /* now we have
836
          Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
837
          U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
838

839
          Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
840
          U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
841
          V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
842
        */
843

    
844
        U0 = vec_mergeh (U,U);
845
        V0 = vec_mergeh (V,V);
846

    
847
        U1 = vec_mergel (U,U);
848
        V1 = vec_mergel (V,V);
849

    
850
        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
851
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
852

    
853
        R  = vec_packclp (R0,R1);
854
        G  = vec_packclp (G0,G1);
855
        B  = vec_packclp (B0,B1);
856

    
857
        switch(c->dstFormat) {
858
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
859
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
860
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
861
        case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
862
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
863
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
864
        default:
865
            {
866
                /* If this is reached, the caller should have called yuv2packedXinC
867
                   instead. */
868
                static int printed_error_message;
869
                if (!printed_error_message) {
870
                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
871
                           sws_format_name(c->dstFormat));
872
                    printed_error_message=1;
873
                }
874
                return;
875
            }
876
        }
877
    }
878

    
879
    if (i < dstW) {
880
        i -= 16;
881

    
882
        Y0 = RND;
883
        Y1 = RND;
884
        /* extract 16 coeffs from lumSrc */
885
        for (j=0; j<lumFilterSize; j++) {
886
            X0 = vec_ld (0,  &lumSrc[j][i]);
887
            X1 = vec_ld (16, &lumSrc[j][i]);
888
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
889
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
890
        }
891

    
892
        U = RND;
893
        V = RND;
894
        /* extract 8 coeffs from U,V */
895
        for (j=0; j<chrFilterSize; j++) {
896
            X  = vec_ld (0, &chrSrc[j][i/2]);
897
            U  = vec_mradds (X, CCoeffs[j], U);
898
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
899
            V  = vec_mradds (X, CCoeffs[j], V);
900
        }
901

    
902
        /* scale and clip signals */
903
        Y0 = vec_sra (Y0, SCL);
904
        Y1 = vec_sra (Y1, SCL);
905
        U  = vec_sra (U,  SCL);
906
        V  = vec_sra (V,  SCL);
907

    
908
        Y0 = vec_clip_s16 (Y0);
909
        Y1 = vec_clip_s16 (Y1);
910
        U  = vec_clip_s16 (U);
911
        V  = vec_clip_s16 (V);
912

    
913
        /* now we have
914
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
915
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
916

917
           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
918
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
919
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
920
        */
921

    
922
        U0 = vec_mergeh (U,U);
923
        V0 = vec_mergeh (V,V);
924

    
925
        U1 = vec_mergel (U,U);
926
        V1 = vec_mergel (V,V);
927

    
928
        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
929
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
930

    
931
        R  = vec_packclp (R0,R1);
932
        G  = vec_packclp (G0,G1);
933
        B  = vec_packclp (B0,B1);
934

    
935
        nout = (vector unsigned char *)scratch;
936
        switch(c->dstFormat) {
937
        case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
938
        case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
939
        case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
940
        case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
941
        case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
942
        case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
943
        default:
944
            /* Unreachable, I think. */
945
            av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
946
                   sws_format_name(c->dstFormat));
947
            return;
948
        }
949

    
950
        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
951
    }
952

    
953
}