Statistics
| Branch: | Revision:

ffmpeg / libswscale / yuv2rgb_altivec.c @ 9655ffb5

History | View | Annotate | Download (37.7 KB)

1
/*
2
 * AltiVec acceleration for colorspace conversion
3
 *
4
 * copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation; either version 2 of the License, or
11
 * (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU General Public License
19
 * along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 */
22

    
23
/*
24
Convert I420 YV12 to RGB in various formats,
25
  it rejects images that are not in 420 formats,
26
  it rejects images that don't have widths of multiples of 16,
27
  it rejects images that don't have heights of multiples of 2.
28
Rejected images are deferred to the C simulation code.
29

30
Lots of optimizations to be done here.
31

32
1. Need to fix saturation code. I just couldn't get it to fly with packs
33
   and adds, so we currently use max/min to clip.
34

35
2. The inefficient use of chroma loading needs a bit of brushing up.
36

37
3. Analysis of pipeline stalls needs to be done. Use shark to identify
38
   pipeline stalls.
39

40

41
MODIFIED to calculate coeffs from currently selected color space.
42
MODIFIED core to be a macro where you specify the output format.
43
ADDED UYVY conversion which is never called due to something in swscale.
44
CORRECTED algorithm selection to be strict on input formats.
45
ADDED runtime detection of AltiVec.
46

47
ADDED altivec_yuv2packedX vertical scl + RGB converter
48

49
March 27,2004
50
PERFORMANCE ANALYSIS
51

52
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53
used as test.
54
The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55
same sequence.
56

57
720 * 480 * 30  ~10MPS
58

59
so we have roughly 10 clocks per pixel. This is too high, something has
60
to be wrong.
61

62
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63
need for vec_min.
64

65
OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66
the input video frame, it was just decompressed so it probably resides in L1
67
caches. However, we are creating the output video stream. This needs to use the
68
DSTST instruction to optimize for the cache. We couple this with the fact that
69
we are not going to be visiting the input buffer again so we mark it Least
70
Recently Used. This shaves 25% of the processor cycles off.
71

72
Now memcpy is the largest mips consumer in the system, probably due
73
to the inefficient X11 stuff.
74

75
GL libraries seem to be very slow on this machine 1.33Ghz PB running
76
Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
77
a versioning issue, however I have libGL.1.2.dylib for both
78
machines. (We need to figure this out now.)
79

80
GL2 libraries work now with patch for RGB32.
81

82
NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
83

84
Integrated luma prescaling adjustment for saturation/contrast/brightness
85
adjustment.
86
*/
87

    
88
#include <stdio.h>
89
#include <stdlib.h>
90
#include <string.h>
91
#include <inttypes.h>
92
#include <assert.h>
93
#include "config.h"
94
#ifdef HAVE_MALLOC_H
95
#include <malloc.h>
96
#endif
97
#include "rgb2rgb.h"
98
#include "swscale.h"
99
#include "swscale_internal.h"
100

    
101
#undef PROFILE_THE_BEAST
102
#undef INC_SCALING
103

    
104
/* Unsigned 8-bit byte; the plane pointers taken from in[] are held as ubyte *. */
typedef unsigned char ubyte;
105
/* Signed 8-bit counterpart of ubyte. */
typedef signed char   sbyte;
106

    
107

    
108
/* RGB interleaver, 16 planar pels 8-bit samples per channel in
109
   homogeneous vector registers x0,x1,x2 are interleaved with the
110
   following technique:
111

112
      o0 = vec_mergeh (x0,x1);
113
      o1 = vec_perm (o0, x2, perm_rgb_0);
114
      o2 = vec_perm (o0, x2, perm_rgb_1);
115
      o3 = vec_mergel (x0,x1);
116
      o4 = vec_perm (o3,o2,perm_rgb_2);
117
      o5 = vec_perm (o3,o2,perm_rgb_3);
118

119
  perm_rgb_0:   o0(RG).h v1(B) --> o1*
120
              0   1  2   3   4
121
             rgbr|gbrg|brgb|rgbr
122
             0010 0100 1001 0010
123
             0102 3145 2673 894A
124

125
  perm_rgb_1:   o0(RG).h v1(B) --> o2
126
              0   1  2   3   4
127
             gbrg|brgb|bbbb|bbbb
128
             0100 1001 1111 1111
129
             B5CD 6EF7 89AB CDEF
130

131
  perm_rgb_2:   o3(RG).l o2(rgbB.l) --> o4*
132
              0   1  2   3   4
133
             gbrg|brgb|rgbr|gbrg
134
             1111 1111 0010 0100
135
             89AB CDEF 0182 3945
136

137
  perm_rgb_3:   o3(RG).l o2(rgbB.l) ---> o5*
138
              0   1  2   3   4
139
             brgb|rgbr|gbrg|brgb
140
             1001 0010 0100 1001
141
             a67b 89cA BdCD eEFf
142

143
*/
144
static
145
const vector unsigned char
146
  perm_rgb_0 = {0x00,0x01,0x10,0x02,0x03,0x11,0x04,0x05,
147
                0x12,0x06,0x07,0x13,0x08,0x09,0x14,0x0a},
148
  perm_rgb_1 = {0x0b,0x15,0x0c,0x0d,0x16,0x0e,0x0f,0x17,
149
                0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f},
150
  perm_rgb_2 = {0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
151
                0x00,0x01,0x18,0x02,0x03,0x19,0x04,0x05},
152
  perm_rgb_3 = {0x1a,0x06,0x07,0x1b,0x08,0x09,0x1c,0x0a,
153
                0x0b,0x1d,0x0c,0x0d,0x1e,0x0e,0x0f,0x1f};
154

    
155
/*
 * Interleave three planar 16-byte channel vectors into 48 bytes of packed
 * 3-byte pixels, delivered in y0, y1, y2 (in memory order).
 *
 * Mind the reversed parameter order: x0 (the THIRD argument) supplies the
 * first byte of every triplet, x1 the second and x2 (the FIRST argument) the
 * third.  y0..y2 must be assignable lvalues; the inputs are expanded more
 * than once, so pass plain variables without side effects.
 */
#define vec_merge3(x2,x1,x0,y0,y1,y2)                    \
do {                                                     \
    __typeof__(x0) mrg_hi, mrg_rest, mrg_lo;             \
    /* byte pairs (x0,x1) for pixels 0-7 */              \
    mrg_hi   = vec_mergeh (x0, x1);                      \
    y0       = vec_perm (mrg_hi, x2, perm_rgb_0);        \
    /* leftover of pixels 5-7 followed by x2[8..15] */   \
    mrg_rest = vec_perm (mrg_hi, x2, perm_rgb_1);        \
    /* byte pairs (x0,x1) for pixels 8-15 */             \
    mrg_lo   = vec_mergel (x0, x1);                      \
    y1       = vec_perm (mrg_lo, mrg_rest, perm_rgb_2);  \
    y2       = vec_perm (mrg_lo, mrg_rest, perm_rgb_3);  \
} while(0)
165

    
166
/*
 * Store 16 pixels as packed 24-bit triplets through ptr.
 *
 * The channels are forwarded to vec_merge3 unchanged; because vec_merge3
 * takes its inputs in reverse order, each triplet is laid out in memory as
 * (x2, x1, x0).
 *
 * ptr must be a modifiable pointer-to-vector lvalue; it is post-incremented
 * once per stored vector, i.e. three times in total.
 *
 * The expansion deliberately carries no trailing semicolon so that the macro
 * behaves as a single statement (safe in an unbraced if/else).
 */
#define vec_mstbgr24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x0,x1,x2,_0,_1,_2);     \
    vec_st (_0, 0, (ptr)++);            \
    vec_st (_1, 0, (ptr)++);            \
    vec_st (_2, 0, (ptr)++);            \
} while (0)
174

    
175
/*
 * Store 16 pixels as packed 24-bit triplets through ptr.
 *
 * The channels are handed to vec_merge3 in swapped order (x2,x1,x0); since
 * vec_merge3 itself takes its inputs reversed, each triplet is laid out in
 * memory as (x0, x1, x2).
 *
 * ptr must be a modifiable pointer-to-vector lvalue; it is post-incremented
 * once per stored vector, i.e. three times in total.
 *
 * The expansion deliberately carries no trailing semicolon so that the macro
 * behaves as a single statement (safe in an unbraced if/else).
 */
#define vec_mstrgb24(x0,x1,x2,ptr)      \
do {                                    \
    __typeof__(x0) _0,_1,_2;            \
    vec_merge3 (x2,x1,x0,_0,_1,_2);     \
    vec_st (_0, 0, (ptr)++);            \
    vec_st (_1, 0, (ptr)++);            \
    vec_st (_2, 0, (ptr)++);            \
} while (0)
183

    
184
/* pack the pixels in rgb0 format
185
   msb R
186
   lsb 0
187
*/
188
/*
 * Store 16 pixels as packed 32-bit quadruplets through ptr.
 *
 *   T        vector type of the four channel operands
 *   x0..x3   channel vectors; the bytes of each pixel are written to memory
 *            in the order x0, x1, x2, x3
 *   ptr      modifiable pointer lvalue; four vectors are stored at byte
 *            offsets 0, 16, 32 and 48, then ptr is advanced by 4 elements
 *
 * The bytes are first merged pairwise (x0 with x1, x2 with x3) and the pairs
 * are then merged as 16-bit units to form whole pixels.
 *
 * The expansion deliberately carries no trailing semicolon so that the macro
 * behaves as a single statement (safe in an unbraced if/else).
 */
#define vec_mstrgb32(T,x0,x1,x2,x3,ptr)                                       \
do {                                                                          \
    T _0,_1,_2,_3;                                                            \
    _0 = vec_mergeh (x0,x1);                                                  \
    _1 = vec_mergeh (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 0*16, (T *)(ptr));                                            \
    vec_st (_3, 1*16, (T *)(ptr));                                            \
    _0 = vec_mergel (x0,x1);                                                  \
    _1 = vec_mergel (x2,x3);                                                  \
    _2 = (T)vec_mergeh ((vector unsigned short)_0,(vector unsigned short)_1); \
    _3 = (T)vec_mergel ((vector unsigned short)_0,(vector unsigned short)_1); \
    vec_st (_2, 2*16, (T *)(ptr));                                            \
    vec_st (_3, 3*16, (T *)(ptr));                                            \
    (ptr) += 4;                                                               \
} while (0)
205

    
206
/*
207

208
  | 1     0       1.4021   | | Y |
209
  | 1    -0.3441 -0.7142   |x| Cb|
210
  | 1     1.7718  0        | | Cr|
211

212

213
  Y:      [-128 127]
214
  Cb/Cr : [-128 127]
215

216
  Typical YUV conversions work on Y: 0-255; this version has been optimized for JPEG decode.
217

218
*/
219

    
220

    
221

    
222

    
223
/*
 * Widen bytes 0-7 of x to eight 16-bit lanes without sign extension.
 * Each lane is built by vec_perm from a zero byte (selector 0x10, taken from
 * the all-zero second operand) followed by one byte of x.
 * The whole expansion is parenthesized so the leading cast cannot bind to
 * surrounding operators.
 */
#define vec_unh(x) \
    ((vector signed short) \
        vec_perm((x),(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x00,0x10,0x01,0x10,0x02,0x10,0x03,\
                                         0x10,0x04,0x10,0x05,0x10,0x06,0x10,0x07})))
228
/*
 * Widen bytes 8-15 of x to eight 16-bit lanes without sign extension.
 * Each lane is built by vec_perm from a zero byte (selector 0x10, taken from
 * the all-zero second operand) followed by one byte of x.
 * The whole expansion is parenthesized so the leading cast cannot bind to
 * surrounding operators.
 */
#define vec_unl(x) \
    ((vector signed short) \
        vec_perm((x),(__typeof__(x)){0}, \
                 ((vector unsigned char){0x10,0x08,0x10,0x09,0x10,0x0A,0x10,0x0B,\
                                         0x10,0x0C,0x10,0x0D,0x10,0x0E,0x10,0x0F})))
233

    
234
/* Clamp every signed 16-bit lane of x to the range [16, 235]. */
#define vec_clip_s16(x)                                                          \
    vec_max (vec_min (x,                                                         \
                      ((vector signed short){235,235,235,235,235,235,235,235})), \
             ((vector signed short){ 16, 16, 16, 16, 16, 16, 16, 16}))
237

    
238
/*
 * Pack two vectors of signed 16-bit lanes (x first, then y) into one vector
 * of 16 unsigned bytes, clipped to 0..255: negative lanes are first raised to
 * zero with vec_max, then vec_packs on the unsigned 16-bit view saturates the
 * upper end.
 * Arguments and the whole expansion are parenthesized so the macro can be
 * used inside larger expressions.
 */
#define vec_packclp(x,y) \
    ((vector unsigned char)vec_packs \
        ((vector unsigned short)vec_max ((x),((vector signed short) {0})), \
         (vector unsigned short)vec_max ((y),((vector signed short) {0}))))
242

    
243
//#define out_pixels(a,b,c,ptr) vec_mstrgb32(__typeof__(a),((__typeof__ (a)){0}),a,a,a,ptr)
244

    
245

    
246
/*
 * Convert eight YUV samples (one per 16-bit lane) to R, G and B.
 *
 *   c        supplies the constants CY, OY, CBU, CRV, CGU, CGV and CSHIFT
 *   Y, U, V  input lanes; 128 is subtracted from U and V before use
 *   R, G, B  receive the unclipped signed 16-bit results
 *
 * Every product goes through vec_mradds, i.e. a rounded multiply-high
 * ((a*b + 0x4000) >> 15) with a saturating add of the third operand.
 */
static inline void cvtyuvtoRGB (SwsContext *c,
                                vector signed short Y, vector signed short U, vector signed short V,
                                vector signed short *R, vector signed short *G, vector signed short *B)
{
    const vector signed short chroma_bias =
        vec_splat ((vector signed short){128}, 0);
    vector signed short luma, cb, cr, green_acc;

    /* scaled and offset luma, shared by all three channels */
    luma = vec_mradds (Y, c->CY, c->OY);

    /* re-centre the chroma samples around zero */
    cb = vec_sub (U, chroma_bias);
    cr = vec_sub (V, chroma_bias);

    /* B = luma + (CBU*(cb<<CSHIFT)+0x4000)>>15 */
    *B = vec_mradds (vec_sl (cb, c->CSHIFT), c->CBU, luma);

    /* R = luma + (CRV*(cr<<CSHIFT)+0x4000)>>15 */
    *R = vec_mradds (vec_sl (cr, c->CSHIFT), c->CRV, luma);

    /* G = luma + CGU*cb + CGV*cr, accumulated in two steps */
    green_acc = vec_mradds (cb, c->CGU, luma);
    *G        = vec_mradds (cr, c->CGV, green_acc);
}
270

    
271

    
272
/*
273
  ------------------------------------------------------------------------------
274
  CS converters
275
  ------------------------------------------------------------------------------
276
*/
277

    
278

    
279
/*
 * DEFCSP420_CVT(name, out_pixels) expands to the definition of
 * altivec_<name>(), which converts a slice of planar 4:2:0 YUV to packed RGB.
 *
 *   out_pixels(R,G,B,ptr)  store hook: writes 16 pixels taken from three
 *                          vector unsigned char channels and advances ptr.
 *
 * Generated function:
 *   c           supplies the constants CY, OY, CRV, CBU, CGU, CGV, CSHIFT
 *               and the row width srcW
 *   in          plane pointers: in[0] = Y, in[1] = U, in[2] = V
 *   instrides   byte stride of each input plane
 *   srcSliceY   first output row of the slice
 *   srcSliceH   number of rows in the slice; rows are processed in pairs
 *   oplanes     oplanes[0] is the packed output image
 *   outstrides  outstrides[0] is the output byte stride
 *   returns     srcSliceH
 *
 * Each inner iteration handles 16 pixels on two adjacent rows, which share
 * 8 U and 8 V samples; srcW is therefore processed in whole groups of 16.
 * Inputs are fetched with the vec_lvsl/vec_perm unaligned-load idiom, which
 * also reads the 16 bytes following each source vector.
 *
 * The two output row pointers are recomputed from outstrides[0] for every
 * row pair.  (Advancing the already-moved pointers by one stride is only
 * correct when the stride equals the packed row width.)
 */
#define DEFCSP420_CVT(name,out_pixels)                                  \
static int altivec_##name (SwsContext *c,                               \
                           unsigned char **in, int *instrides,          \
                           int srcSliceY,        int srcSliceH,         \
                           unsigned char **oplanes, int *outstrides)    \
{                                                                       \
    int w = c->srcW;                                                    \
    int h = srcSliceH;                                                  \
    int i,j;                                                            \
    int instrides_scl[3];                                               \
    vector unsigned char y0,y1;                                         \
                                                                        \
    vector signed char  u,v;                                            \
                                                                        \
    vector signed short Y0,Y1,Y2,Y3;                                    \
    vector signed short U,V;                                            \
    vector signed short vx,ux,uvx;                                      \
    vector signed short vx0,ux0,uvx0;                                   \
    vector signed short vx1,ux1,uvx1;                                   \
    vector signed short R0,G0,B0;                                       \
    vector signed short R1,G1,B1;                                       \
    vector unsigned char R,G,B;                                         \
                                                                        \
    vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP;                  \
    vector unsigned char align_perm;                                    \
                                                                        \
    /* conversion constants, copied out of the context once */          \
    vector signed short                                                 \
        lCY  = c->CY,                                                   \
        lOY  = c->OY,                                                   \
        lCRV = c->CRV,                                                  \
        lCBU = c->CBU,                                                  \
        lCGU = c->CGU,                                                  \
        lCGV = c->CGV;                                                  \
                                                                        \
    vector unsigned short lCSHIFT = c->CSHIFT;                          \
                                                                        \
    /* y1i walks the even luma row, y2i the odd row below it */         \
    ubyte *y1i   = in[0];                                               \
    ubyte *y2i   = in[0]+instrides[0];                                  \
    ubyte *ui    = in[1];                                               \
    ubyte *vi    = in[2];                                               \
                                                                        \
    /* start of the first output row of this slice */                   \
    ubyte *dst_rows = oplanes[0]+srcSliceY*outstrides[0];               \
    vector unsigned char *oute, *outo;                                  \
                                                                        \
                                                                        \
    instrides_scl[0] = instrides[0]*2-w;  /* the loop moves y{1,2}i by w */ \
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */    \
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */    \
                                                                        \
                                                                        \
    for (i=0;i<h/2;i++) {                                               \
        /* even/odd output rows of this pair, derived from the stride */ \
        oute = (vector unsigned char *)                                 \
                   (dst_rows+(2*i  )*outstrides[0]);                    \
        outo = (vector unsigned char *)                                 \
                   (dst_rows+(2*i+1)*outstrides[0]);                    \
                                                                        \
        /* data-stream hints for the two rows about to be written */    \
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);          \
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);          \
                                                                        \
        for (j=0;j<w/16;j++) {                                          \
                                                                        \
            y1ivP = (vector unsigned char *)y1i;                        \
            y2ivP = (vector unsigned char *)y2i;                        \
            uivP  = (vector unsigned char *)ui;                         \
            vivP  = (vector unsigned char *)vi;                         \
                                                                        \
            /* unaligned loads: two vectors realigned with vec_perm */  \
            align_perm = vec_lvsl (0, y1i);                             \
            y0 = (vector unsigned char)                                 \
                 vec_perm (y1ivP[0], y1ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, y2i);                             \
            y1 = (vector unsigned char)                                 \
                 vec_perm (y2ivP[0], y2ivP[1], align_perm);             \
                                                                        \
            align_perm = vec_lvsl (0, ui);                              \
            u = (vector signed char)                                    \
                vec_perm (uivP[0], uivP[1], align_perm);                \
                                                                        \
            align_perm = vec_lvsl (0, vi);                              \
            v = (vector signed char)                                    \
                vec_perm (vivP[0], vivP[1], align_perm);                \
                                                                        \
            /* re-centre chroma around zero (bias of 128, modulo 256) */ \
            u  = (vector signed char)                                   \
                 vec_sub (u,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
            v  = (vector signed char)                                   \
                 vec_sub (v,(vector signed char)                        \
                          vec_splat((vector signed char){128},0));      \
                                                                        \
            /* only the first 8 chroma samples belong to these 16 pels */ \
            U  = vec_unpackh (u);                                       \
            V  = vec_unpackh (v);                                       \
                                                                        \
                                                                        \
            Y0 = vec_unh (y0);                                          \
            Y1 = vec_unl (y0);                                          \
            Y2 = vec_unh (y1);                                          \
            Y3 = vec_unl (y1);                                          \
                                                                        \
            Y0 = vec_mradds (Y0, lCY, lOY);                             \
            Y1 = vec_mradds (Y1, lCY, lOY);                             \
            Y2 = vec_mradds (Y2, lCY, lOY);                             \
            Y3 = vec_mradds (Y3, lCY, lOY);                             \
                                                                        \
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */                  \
            ux = vec_sl (U, lCSHIFT);                                   \
            ux = vec_mradds (ux, lCBU, (vector signed short){0});       \
            /* duplicate each chroma term for its two luma columns */   \
            ux0  = vec_mergeh (ux,ux);                                  \
            ux1  = vec_mergel (ux,ux);                                  \
                                                                        \
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */            \
            vx = vec_sl (V, lCSHIFT);                                   \
            vx = vec_mradds (vx, lCRV, (vector signed short){0});       \
            vx0  = vec_mergeh (vx,vx);                                  \
            vx1  = vec_mergel (vx,vx);                                  \
                                                                        \
            /* uvx = ((CGU*u) + (CGV*v))>>15 */                         \
            uvx = vec_mradds (U, lCGU, (vector signed short){0});       \
            uvx = vec_mradds (V, lCGV, uvx);                            \
            uvx0 = vec_mergeh (uvx,uvx);                                \
            uvx1 = vec_mergel (uvx,uvx);                                \
                                                                        \
            /* even row */                                              \
            R0 = vec_add (Y0,vx0);                                      \
            G0 = vec_add (Y0,uvx0);                                     \
            B0 = vec_add (Y0,ux0);                                      \
            R1 = vec_add (Y1,vx1);                                      \
            G1 = vec_add (Y1,uvx1);                                     \
            B1 = vec_add (Y1,ux1);                                      \
                                                                        \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
            out_pixels(R,G,B,oute);                                     \
                                                                        \
            /* odd row, reusing the same chroma terms */                \
            R0 = vec_add (Y2,vx0);                                      \
            G0 = vec_add (Y2,uvx0);                                     \
            B0 = vec_add (Y2,ux0);                                      \
            R1 = vec_add (Y3,vx1);                                      \
            G1 = vec_add (Y3,uvx1);                                     \
            B1 = vec_add (Y3,ux1);                                      \
            R  = vec_packclp (R0,R1);                                   \
            G  = vec_packclp (G0,G1);                                   \
            B  = vec_packclp (B0,B1);                                   \
                                                                        \
                                                                        \
            out_pixels(R,G,B,outo);                                     \
                                                                        \
            y1i  += 16;                                                 \
            y2i  += 16;                                                 \
            ui   += 8;                                                  \
            vi   += 8;                                                  \
                                                                        \
        }                                                               \
                                                                        \
        /* skip input padding; luma also skips the row already done */  \
        ui    += instrides_scl[1];                                      \
        vi    += instrides_scl[2];                                      \
        y1i   += instrides_scl[0];                                      \
        y2i   += instrides_scl[0];                                      \
    }                                                                   \
    return srcSliceH;                                                   \
}
442

    
443

    
444
/*
 * Store hooks for DEFCSP420_CVT.  Each takes three channel vectors (a,b,c)
 * plus the output pointer and forwards them to the matching packer.
 * For the 32-bit layouts the fourth byte of every pixel is filled from an
 * all-zero vector of the same type as a; the argument order below is the
 * byte order written to memory.
 */
#define out_abgr(a,b,c,ptr)                                             \
    vec_mstrgb32(__typeof__(a), ((__typeof__ (a)){0}), c, b, a, ptr)
#define out_bgra(a,b,c,ptr)                                             \
    vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__ (a)){0}), ptr)
#define out_rgba(a,b,c,ptr)                                             \
    vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__ (a)){0}), ptr)
#define out_argb(a,b,c,ptr)                                             \
    vec_mstrgb32(__typeof__(a), ((__typeof__ (a)){0}), a, b, c, ptr)

/* 24-bit layouts: no padding byte, handled by the 3-channel packers. */
#define out_rgb24(a,b,c,ptr)                                            \
    vec_mstrgb24(a, b, c, ptr)
#define out_bgr24(a,b,c,ptr)                                            \
    vec_mstbgr24(a, b, c, ptr)
450

    
451
DEFCSP420_CVT (yuv2_abgr, out_abgr)
452
#if 1
453
DEFCSP420_CVT (yuv2_bgra, out_bgra)
454
#else
455
static int altivec_yuv2_bgra32 (SwsContext *c,
456
                                unsigned char **in, int *instrides,
457
                                int srcSliceY,        int srcSliceH,
458
                                unsigned char **oplanes, int *outstrides)
459
{
460
    int w = c->srcW;
461
    int h = srcSliceH;
462
    int i,j;
463
    int instrides_scl[3];
464
    vector unsigned char y0,y1;
465

    
466
    vector signed char  u,v;
467

    
468
    vector signed short Y0,Y1,Y2,Y3;
469
    vector signed short U,V;
470
    vector signed short vx,ux,uvx;
471
    vector signed short vx0,ux0,uvx0;
472
    vector signed short vx1,ux1,uvx1;
473
    vector signed short R0,G0,B0;
474
    vector signed short R1,G1,B1;
475
    vector unsigned char R,G,B;
476

    
477
    vector unsigned char *uivP, *vivP;
478
    vector unsigned char align_perm;
479

    
480
    vector signed short
481
        lCY  = c->CY,
482
        lOY  = c->OY,
483
        lCRV = c->CRV,
484
        lCBU = c->CBU,
485
        lCGU = c->CGU,
486
        lCGV = c->CGV;
487

    
488
    vector unsigned short lCSHIFT = c->CSHIFT;
489

    
490
    ubyte *y1i   = in[0];
491
    ubyte *y2i   = in[0]+w;
492
    ubyte *ui    = in[1];
493
    ubyte *vi    = in[2];
494

    
495
    vector unsigned char *oute
496
        = (vector unsigned char *)
497
          (oplanes[0]+srcSliceY*outstrides[0]);
498
    vector unsigned char *outo
499
        = (vector unsigned char *)
500
          (oplanes[0]+srcSliceY*outstrides[0]+outstrides[0]);
501

    
502

    
503
    instrides_scl[0] = instrides[0];
504
    instrides_scl[1] = instrides[1]-w/2;  /* the loop moves ui by w/2 */
505
    instrides_scl[2] = instrides[2]-w/2;  /* the loop moves vi by w/2 */
506

    
507

    
508
    for (i=0;i<h/2;i++) {
509
        vec_dstst (outo, (0x02000002|(((w*3+32)/32)<<16)), 0);
510
        vec_dstst (oute, (0x02000002|(((w*3+32)/32)<<16)), 1);
511

    
512
        for (j=0;j<w/16;j++) {
513

    
514
            y0 = vec_ldl (0,y1i);
515
            y1 = vec_ldl (0,y2i);
516
            uivP = (vector unsigned char *)ui;
517
            vivP = (vector unsigned char *)vi;
518

    
519
            align_perm = vec_lvsl (0, ui);
520
            u  = (vector signed char)vec_perm (uivP[0], uivP[1], align_perm);
521

    
522
            align_perm = vec_lvsl (0, vi);
523
            v  = (vector signed char)vec_perm (vivP[0], vivP[1], align_perm);
524
            u  = (vector signed char)
525
                 vec_sub (u,(vector signed char)
526
                          vec_splat((vector signed char){128},0));
527

    
528
            v  = (vector signed char)
529
                 vec_sub (v, (vector signed char)
530
                          vec_splat((vector signed char){128},0));
531

    
532
            U  = vec_unpackh (u);
533
            V  = vec_unpackh (v);
534

    
535

    
536
            Y0 = vec_unh (y0);
537
            Y1 = vec_unl (y0);
538
            Y2 = vec_unh (y1);
539
            Y3 = vec_unl (y1);
540

    
541
            Y0 = vec_mradds (Y0, lCY, lOY);
542
            Y1 = vec_mradds (Y1, lCY, lOY);
543
            Y2 = vec_mradds (Y2, lCY, lOY);
544
            Y3 = vec_mradds (Y3, lCY, lOY);
545

    
546
            /*   ux  = (CBU*(u<<CSHIFT)+0x4000)>>15 */
547
            ux = vec_sl (U, lCSHIFT);
548
            ux = vec_mradds (ux, lCBU, (vector signed short){0});
549
            ux0  = vec_mergeh (ux,ux);
550
            ux1  = vec_mergel (ux,ux);
551

    
552
            /* vx  = (CRV*(v<<CSHIFT)+0x4000)>>15;        */
553
            vx = vec_sl (V, lCSHIFT);
554
            vx = vec_mradds (vx, lCRV, (vector signed short){0});
555
            vx0  = vec_mergeh (vx,vx);
556
            vx1  = vec_mergel (vx,vx);
557
            /* uvx = ((CGU*u) + (CGV*v))>>15 */
558
            uvx = vec_mradds (U, lCGU, (vector signed short){0});
559
            uvx = vec_mradds (V, lCGV, uvx);
560
            uvx0 = vec_mergeh (uvx,uvx);
561
            uvx1 = vec_mergel (uvx,uvx);
562
            R0 = vec_add (Y0,vx0);
563
            G0 = vec_add (Y0,uvx0);
564
            B0 = vec_add (Y0,ux0);
565
            R1 = vec_add (Y1,vx1);
566
            G1 = vec_add (Y1,uvx1);
567
            B1 = vec_add (Y1,ux1);
568
            R  = vec_packclp (R0,R1);
569
            G  = vec_packclp (G0,G1);
570
            B  = vec_packclp (B0,B1);
571

    
572
            out_argb(R,G,B,oute);
573
            R0 = vec_add (Y2,vx0);
574
            G0 = vec_add (Y2,uvx0);
575
            B0 = vec_add (Y2,ux0);
576
            R1 = vec_add (Y3,vx1);
577
            G1 = vec_add (Y3,uvx1);
578
            B1 = vec_add (Y3,ux1);
579
            R  = vec_packclp (R0,R1);
580
            G  = vec_packclp (G0,G1);
581
            B  = vec_packclp (B0,B1);
582

    
583
            out_argb(R,G,B,outo);
584
            y1i  += 16;
585
            y2i  += 16;
586
            ui   += 8;
587
            vi   += 8;
588

    
589
        }
590

    
591
        outo  += (outstrides[0])>>4;
592
        oute  += (outstrides[0])>>4;
593

    
594
        ui    += instrides_scl[1];
595
        vi    += instrides_scl[2];
596
        y1i   += instrides_scl[0];
597
        y2i   += instrides_scl[0];
598
    }
599
    return srcSliceH;
600
}
601

    
602
#endif
603

    
604

    
605
DEFCSP420_CVT (yuv2_rgba, out_rgba)
606
DEFCSP420_CVT (yuv2_argb, out_argb)
607
DEFCSP420_CVT (yuv2_rgb24,  out_rgb24)
608
DEFCSP420_CVT (yuv2_bgr24,  out_bgr24)
609

    
610

    
611
// uyvy|uyvy|uyvy|uyvy
612
// 0123 4567 89ab cdef
613
static
614
const vector unsigned char
615
    demux_u = {0x10,0x00,0x10,0x00,
616
               0x10,0x04,0x10,0x04,
617
               0x10,0x08,0x10,0x08,
618
               0x10,0x0c,0x10,0x0c},
619
    demux_v = {0x10,0x02,0x10,0x02,
620
               0x10,0x06,0x10,0x06,
621
               0x10,0x0A,0x10,0x0A,
622
               0x10,0x0E,0x10,0x0E},
623
    demux_y = {0x10,0x01,0x10,0x03,
624
               0x10,0x05,0x10,0x07,
625
               0x10,0x09,0x10,0x0B,
626
               0x10,0x0D,0x10,0x0F};
627

    
628
/*
629
  this is so I can play live CCIR raw video
630
*/
631
static int altivec_uyvy_rgb32 (SwsContext *c,
632
                               unsigned char **in, int *instrides,
633
                               int srcSliceY,        int srcSliceH,
634
                               unsigned char **oplanes, int *outstrides)
635
{
636
    int w = c->srcW;
637
    int h = srcSliceH;
638
    int i,j;
639
    vector unsigned char uyvy;
640
    vector signed   short Y,U,V;
641
    vector signed   short R0,G0,B0,R1,G1,B1;
642
    vector unsigned char  R,G,B;
643
    vector unsigned char *out;
644
    ubyte *img;
645

    
646
    img = in[0];
647
    out = (vector unsigned char *)(oplanes[0]+srcSliceY*outstrides[0]);
648

    
649
    for (i=0;i<h;i++) {
650
        for (j=0;j<w/16;j++) {
651
            uyvy = vec_ld (0, img);
652
            U = (vector signed short)
653
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);
654

    
655
            V = (vector signed short)
656
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);
657

    
658
            Y = (vector signed short)
659
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);
660

    
661
            cvtyuvtoRGB (c, Y,U,V,&R0,&G0,&B0);
662

    
663
            uyvy = vec_ld (16, img);
664
            U = (vector signed short)
665
                vec_perm (uyvy, (vector unsigned char){0}, demux_u);
666

    
667
            V = (vector signed short)
668
                vec_perm (uyvy, (vector unsigned char){0}, demux_v);
669

    
670
            Y = (vector signed short)
671
                vec_perm (uyvy, (vector unsigned char){0}, demux_y);
672

    
673
            cvtyuvtoRGB (c, Y,U,V,&R1,&G1,&B1);
674

    
675
            R  = vec_packclp (R0,R1);
676
            G  = vec_packclp (G0,G1);
677
            B  = vec_packclp (B0,B1);
678

    
679
            //      vec_mstbgr24 (R,G,B, out);
680
            out_rgba (R,G,B,out);
681

    
682
            img += 32;
683
        }
684
    }
685
    return srcSliceH;
686
}
687

    
688

    
689

    
690
/* Ok currently the acceleration routine only supports
691
   inputs of widths a multiple of 16
692
   and heights a multiple 2
693

694
   So we just fall back to the C codes for this.
695
*/
696
SwsFunc yuv2rgb_init_altivec (SwsContext *c)
697
{
698
    if (!(c->flags & SWS_CPU_CAPS_ALTIVEC))
699
        return NULL;
700

    
701
    /*
702
      and this seems not to matter too much I tried a bunch of
703
      videos with abnormal widths and MPlayer crashes elsewhere.
704
      mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
705
      boom with X11 bad match.
706

707
    */
708
    if ((c->srcW & 0xf) != 0)    return NULL;
709

    
710
    switch (c->srcFormat) {
711
    case PIX_FMT_YUV410P:
712
    case PIX_FMT_YUV420P:
713
    /*case IMGFMT_CLPL:        ??? */
714
    case PIX_FMT_GRAY8:
715
    case PIX_FMT_NV12:
716
    case PIX_FMT_NV21:
717
        if ((c->srcH & 0x1) != 0)
718
            return NULL;
719

    
720
        switch(c->dstFormat){
721
        case PIX_FMT_RGB24:
722
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
723
            return altivec_yuv2_rgb24;
724
        case PIX_FMT_BGR24:
725
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
726
            return altivec_yuv2_bgr24;
727
        case PIX_FMT_ARGB:
728
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
729
            return altivec_yuv2_argb;
730
        case PIX_FMT_ABGR:
731
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
732
            return altivec_yuv2_abgr;
733
        case PIX_FMT_RGBA:
734
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
735
            return altivec_yuv2_rgba;
736
        case PIX_FMT_BGRA:
737
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
738
            return altivec_yuv2_bgra;
739
        default: return NULL;
740
        }
741
        break;
742

    
743
    case PIX_FMT_UYVY422:
744
        switch(c->dstFormat){
745
        case PIX_FMT_BGR32:
746
            av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
747
            return altivec_uyvy_rgb32;
748
        default: return NULL;
749
        }
750
        break;
751

    
752
    }
753
    return NULL;
754
}
755

    
756
/*
 * Compute the colour-conversion coefficients and store them in the
 * context, each one replicated across all lanes of a vector.
 *
 *   c          - context receiving CSHIFT, CY, OY, CRV, CBU, CGU and CGV
 *   inv_table  - four conversion coefficients; entries 0 and 1 feed CRV and
 *                CBU, entries 2 and 3 feed CGU and CGV (negated)
 *   brightness - luma offset, stored as -256*brightness
 *   contrast   - luma gain; also scales the chroma coefficients
 *   saturation - chroma gain
 *
 * Every value is truncated to a signed 16-bit lane on store.
 * NOTE(review): the >>16 on contrast and saturation suggests 16.16
 * fixed-point inputs -- confirm against the caller.
 */
void yuv2rgb_altivec_init_tables (SwsContext *c, const int inv_table[4],int brightness,int contrast, int saturation)
{
    /* The union lets the scalar results be read back as one vector. */
    union {
        signed short tmp[8] __attribute__ ((aligned(16)));
        vector signed short vec;
    } coeffs;

    const int con = contrast   >> 16;
    const int sat = saturation >> 16;

    coeffs.tmp[0] = ((0xffffLL * contrast) >> 8) >> 9;    /* cy  */
    coeffs.tmp[1] = -256 * brightness;                    /* oy  */
    coeffs.tmp[2] =   (inv_table[0] >> 3) * con * sat;    /* crv */
    coeffs.tmp[3] =   (inv_table[1] >> 3) * con * sat;    /* cbu */
    coeffs.tmp[4] = -((inv_table[2] >> 1) * con * sat);   /* cgu */
    coeffs.tmp[5] = -((inv_table[3] >> 1) * con * sat);   /* cgv */

    /* Broadcast each coefficient to every lane of its own vector. */
    c->CSHIFT = (vector unsigned short)vec_splat_u16(2);
    c->CY     = vec_splat ((vector signed short)coeffs.vec, 0);
    c->OY     = vec_splat ((vector signed short)coeffs.vec, 1);
    c->CRV    = vec_splat ((vector signed short)coeffs.vec, 2);
    c->CBU    = vec_splat ((vector signed short)coeffs.vec, 3);
    c->CGU    = vec_splat ((vector signed short)coeffs.vec, 4);
    c->CGV    = vec_splat ((vector signed short)coeffs.vec, 5);
}
789

    
790

    
791
void
792
altivec_yuv2packedX (SwsContext *c,
793
                     int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
794
                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
795
                     uint8_t *dest, int dstW, int dstY)
796
{
797
    int i,j;
798
    vector signed short X,X0,X1,Y0,U0,V0,Y1,U1,V1,U,V;
799
    vector signed short R0,G0,B0,R1,G1,B1;
800

    
801
    vector unsigned char R,G,B;
802
    vector unsigned char *out,*nout;
803

    
804
    vector signed short   RND = vec_splat_s16(1<<3);
805
    vector unsigned short SCL = vec_splat_u16(4);
806
    unsigned long scratch[16] __attribute__ ((aligned (16)));
807

    
808
    vector signed short *YCoeffs, *CCoeffs;
809

    
810
    YCoeffs = c->vYCoeffsBank+dstY*lumFilterSize;
811
    CCoeffs = c->vCCoeffsBank+dstY*chrFilterSize;
812

    
813
    out = (vector unsigned char *)dest;
814

    
815
    for (i=0; i<dstW; i+=16){
816
        Y0 = RND;
817
        Y1 = RND;
818
        /* extract 16 coeffs from lumSrc */
819
        for (j=0; j<lumFilterSize; j++) {
820
            X0 = vec_ld (0,  &lumSrc[j][i]);
821
            X1 = vec_ld (16, &lumSrc[j][i]);
822
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
823
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
824
        }
825

    
826
        U = RND;
827
        V = RND;
828
        /* extract 8 coeffs from U,V */
829
        for (j=0; j<chrFilterSize; j++) {
830
            X  = vec_ld (0, &chrSrc[j][i/2]);
831
            U  = vec_mradds (X, CCoeffs[j], U);
832
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
833
            V  = vec_mradds (X, CCoeffs[j], V);
834
        }
835

    
836
        /* scale and clip signals */
837
        Y0 = vec_sra (Y0, SCL);
838
        Y1 = vec_sra (Y1, SCL);
839
        U  = vec_sra (U,  SCL);
840
        V  = vec_sra (V,  SCL);
841

    
842
        Y0 = vec_clip_s16 (Y0);
843
        Y1 = vec_clip_s16 (Y1);
844
        U  = vec_clip_s16 (U);
845
        V  = vec_clip_s16 (V);
846

    
847
        /* now we have
848
          Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
849
          U= u0 u1 u2 u3 u4 u5 u6 u7      V= v0 v1 v2 v3 v4 v5 v6 v7
850

851
          Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
852
          U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
853
          V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
854
        */
855

    
856
        U0 = vec_mergeh (U,U);
857
        V0 = vec_mergeh (V,V);
858

    
859
        U1 = vec_mergel (U,U);
860
        V1 = vec_mergel (V,V);
861

    
862
        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
863
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
864

    
865
        R  = vec_packclp (R0,R1);
866
        G  = vec_packclp (G0,G1);
867
        B  = vec_packclp (B0,B1);
868

    
869
        switch(c->dstFormat) {
870
            case PIX_FMT_ABGR:  out_abgr  (R,G,B,out); break;
871
            case PIX_FMT_BGRA:  out_bgra  (R,G,B,out); break;
872
            case PIX_FMT_RGBA:  out_rgba  (R,G,B,out); break;
873
            case PIX_FMT_ARGB:  out_argb  (R,G,B,out); break;
874
            case PIX_FMT_RGB24: out_rgb24 (R,G,B,out); break;
875
            case PIX_FMT_BGR24: out_bgr24 (R,G,B,out); break;
876
            default:
877
            {
878
                /* If this is reached, the caller should have called yuv2packedXinC
879
                   instead. */
880
                static int printed_error_message;
881
                if (!printed_error_message) {
882
                    av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
883
                           sws_format_name(c->dstFormat));
884
                    printed_error_message=1;
885
                }
886
                return;
887
            }
888
        }
889
    }
890

    
891
    if (i < dstW) {
892
        i -= 16;
893

    
894
        Y0 = RND;
895
        Y1 = RND;
896
        /* extract 16 coeffs from lumSrc */
897
        for (j=0; j<lumFilterSize; j++) {
898
            X0 = vec_ld (0,  &lumSrc[j][i]);
899
            X1 = vec_ld (16, &lumSrc[j][i]);
900
            Y0 = vec_mradds (X0, YCoeffs[j], Y0);
901
            Y1 = vec_mradds (X1, YCoeffs[j], Y1);
902
        }
903

    
904
        U = RND;
905
        V = RND;
906
        /* extract 8 coeffs from U,V */
907
        for (j=0; j<chrFilterSize; j++) {
908
            X  = vec_ld (0, &chrSrc[j][i/2]);
909
            U  = vec_mradds (X, CCoeffs[j], U);
910
            X  = vec_ld (0, &chrSrc[j][i/2+2048]);
911
            V  = vec_mradds (X, CCoeffs[j], V);
912
        }
913

    
914
        /* scale and clip signals */
915
        Y0 = vec_sra (Y0, SCL);
916
        Y1 = vec_sra (Y1, SCL);
917
        U  = vec_sra (U,  SCL);
918
        V  = vec_sra (V,  SCL);
919

    
920
        Y0 = vec_clip_s16 (Y0);
921
        Y1 = vec_clip_s16 (Y1);
922
        U  = vec_clip_s16 (U);
923
        V  = vec_clip_s16 (V);
924

    
925
        /* now we have
926
           Y0= y0 y1 y2 y3 y4 y5 y6 y7     Y1= y8 y9 y10 y11 y12 y13 y14 y15
927
           U = u0 u1 u2 u3 u4 u5 u6 u7     V = v0 v1 v2 v3 v4 v5 v6 v7
928

929
           Y0= y0 y1 y2 y3 y4 y5 y6 y7    Y1= y8 y9 y10 y11 y12 y13 y14 y15
930
           U0= u0 u0 u1 u1 u2 u2 u3 u3    U1= u4 u4 u5 u5 u6 u6 u7 u7
931
           V0= v0 v0 v1 v1 v2 v2 v3 v3    V1= v4 v4 v5 v5 v6 v6 v7 v7
932
        */
933

    
934
        U0 = vec_mergeh (U,U);
935
        V0 = vec_mergeh (V,V);
936

    
937
        U1 = vec_mergel (U,U);
938
        V1 = vec_mergel (V,V);
939

    
940
        cvtyuvtoRGB (c, Y0,U0,V0,&R0,&G0,&B0);
941
        cvtyuvtoRGB (c, Y1,U1,V1,&R1,&G1,&B1);
942

    
943
        R  = vec_packclp (R0,R1);
944
        G  = vec_packclp (G0,G1);
945
        B  = vec_packclp (B0,B1);
946

    
947
        nout = (vector unsigned char *)scratch;
948
        switch(c->dstFormat) {
949
            case PIX_FMT_ABGR:  out_abgr  (R,G,B,nout); break;
950
            case PIX_FMT_BGRA:  out_bgra  (R,G,B,nout); break;
951
            case PIX_FMT_RGBA:  out_rgba  (R,G,B,nout); break;
952
            case PIX_FMT_ARGB:  out_argb  (R,G,B,nout); break;
953
            case PIX_FMT_RGB24: out_rgb24 (R,G,B,nout); break;
954
            case PIX_FMT_BGR24: out_bgr24 (R,G,B,nout); break;
955
            default:
956
                /* Unreachable, I think. */
957
                av_log(c, AV_LOG_ERROR, "altivec_yuv2packedX doesn't support %s output\n",
958
                       sws_format_name(c->dstFormat));
959
                return;
960
        }
961

    
962
        memcpy (&((uint32_t*)dest)[i], scratch, (dstW-i)/4);
963
    }
964

    
965
}