Revision 8a322796

View differences:

libswscale/internal_bfin.S
2 2
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3 3
 *                    April 20, 2007
4 4
 *
5
 * Blackfin Video Color Space Converters Operations
6
 *  convert I420 YV12 to RGB in various formats,
5
 * Blackfin video color space converter operations
6
 * convert I420 YV12 to RGB in various formats
7 7
 *
8 8
 * This file is part of FFmpeg.
9 9
 *
......
24 24

  
25 25

  
26 26
/*
27
YUV420 to RGB565 conversion.  This routine takes a YUV 420 planar macroblock
28
and converts it to RGB565.  R:5 bits, G:6 bits, B:5 bits.. packed into shorts
27
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
28
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.
29 29

  
30 30

  
31 31
The following calculation is used for the conversion:
......
34 34
  g = clipz((y-oy)*cy  + cgv*(v-128) + cgu*(u-128))
35 35
  b = clipz((y-oy)*cy  + cbu*(u-128))
36 36

  
37
y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
37
y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
38 38

  
39 39

  
40 40
New factorization to eliminate the truncation error which was
41
occuring due to the byteop3p.
41
occurring due to the byteop3p.
42 42

  
43 43

  
44
1) use the bytop16m to subtract quad bytes we use this in U8 this
44
1) Use the byteop16m to subtract quad bytes we use this in U8 this
45 45
 then so the offsets need to be renormalized to 8bits.
46 46

  
47
2) scale operands up by a factor of 4 not 8 because Blackfin
47
2) Scale operands up by a factor of 4 not 8 because Blackfin
48 48
   multiplies include a shift.
49 49

  
50
3) compute into the accumulators cy*yx0, cy*yx1
50
3) Compute into the accumulators cy*yx0, cy*yx1.
51 51

  
52
4) compute each of the linear equations
52
4) Compute each of the linear equations:
53 53
     r = clipz((y - oy) * cy  + crv * (v - 128))
54 54

  
55 55
     g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))
56 56

  
57 57
     b = clipz((y - oy) * cy  + cbu * (u - 128))
58 58

  
59
   reuse of the accumulators requires that we actually multiply
60
   twice once with addition and the second time with a subtaction.
59
   Reuse of the accumulators requires that we actually multiply
60
   twice once with addition and the second time with a subtraction.
61 61

  
62
   because of this we need to compute the equations in the order R B
62
   Because of this we need to compute the equations in the order R B
63 63
   then G saving the writes for B in the case of 24/32 bit color
64 64
   formats.
65 65

  
66
   api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
66
   API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
67 67
                      int dW, uint32_t *coeffs);
68 68

  
69 69
       A          B
......
77 77

  
78 78
coeffs is a pointer to oy.
79 79

  
80
the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
81
replication is used to simplify the internal algorithms for the dual mac architecture
82
of BlackFin.
80
The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
81
replication is used to simplify the internal algorithms for the dual MAC
82
architecture of BlackFin.
83 83

  
84
All routines are exported with _ff_bfin_ as a symbol prefix
84
All routines are exported with _ff_bfin_ as a symbol prefix.
85 85

  
86
rough performance gain compared against -O3:
86
Rough performance gain compared against -O3:
87 87

  
88 88
2779809/1484290 187.28%
89 89

  
libswscale/rgb2rgb.c
1 1
/*
2
 *  rgb2rgb.c, Software RGB to RGB convertor
3
 *  pluralize by Software PAL8 to RGB convertor
4
 *               Software YUV to YUV convertor
5
 *               Software YUV to RGB convertor
6
 *  Written by Nick Kurshev.
7
 *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
2
 * software RGB to RGB converter
3
 * pluralize by software PAL8 to RGB converter
4
 *              software YUV to YUV converter
5
 *              software YUV to RGB converter
6
 * Written by Nick Kurshev.
7
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 8
 *
9 9
 * This file is part of FFmpeg.
10 10
 *
......
22 22
 * along with FFmpeg; if not, write to the Free Software
23 23
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 24
 *
25
 * the C code (not assembly, mmx, ...) of this file can be used
26
 * under the LGPL license too
25
 * The C code (not assembly, MMX, ...) of this file can be used
26
 * under the LGPL license.
27 27
 */
28 28
#include <inttypes.h>
29 29
#include "config.h"
......
33 33
#include "swscale.h"
34 34
#include "swscale_internal.h"
35 35

  
36
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
36
#define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients
37 37

  
38 38
void (*rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size);
39 39
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size);
......
149 149
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
150 150
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
151 151

  
152
//Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
153
//Plain C versions
152
//Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one.
153
//plain C versions
154 154
#undef HAVE_MMX
155 155
#undef HAVE_MMX2
156 156
#undef HAVE_3DNOW
......
190 190
#endif //ARCH_X86 || ARCH_X86_64
191 191

  
192 192
/*
193
 rgb15->rgb16 Original by Strepto/Astral
193
 RGB15->RGB16 original by Strepto/Astral
194 194
 ported to gcc & bugfixed : A'rpi
195 195
 MMX2, 3DNOW optimization by Nick Kurshev
196
 32bit c version, and and&add trick by Michael Niedermayer
196
 32-bit C version, and and&add trick by Michael Niedermayer
197 197
*/
198 198

  
199 199
void sws_rgb2rgb_init(int flags){
......
266 266
{
267 267
    long i;
268 268
/*
269
    writes 1 byte o much and might cause alignment issues on some architectures?
269
    Writes 1 byte too much and might cause alignment issues on some architectures?
270 270
    for (i=0; i<num_pixels; i++)
271 271
        ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
272 272
*/
......
284 284
{
285 285
    long i;
286 286
/*
287
    writes 1 byte o much and might cause alignment issues on some architectures?
287
    Writes 1 byte too much and might cause alignment issues on some architectures?
288 288
    for (i=0; i<num_pixels; i++)
289 289
        ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]];
290 290
*/
......
299 299
}
300 300

  
301 301
/**
302
 * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
302
 * Palette is assumed to contain BGR16, see rgb32to16 to convert the palette.
303 303
 */
304 304
void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette)
305 305
{
libswscale/rgb2rgb.h
1 1
/*
2
 *  rgb2rgb.h, Software RGB to RGB convertor
3
 *  pluralize by Software PAL8 to RGB convertor
4
 *               Software YUV to YUV convertor
5
 *               Software YUV to RGB convertor
2
 *  software RGB to RGB converter
3
 *  pluralize by Software PAL8 to RGB converter
4
 *               Software YUV to YUV converter
5
 *               Software YUV to RGB converter
6 6
 *  Written by Nick Kurshev.
7 7
 *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 8
 *
......
28 28

  
29 29
#include <inttypes.h>
30 30

  
31
/* A full collection of rgb to rgb(bgr) convertors */
31
/* A full collection of RGB to RGB(BGR) converters */
32 32
extern void (*rgb24to32)   (const uint8_t *src, uint8_t *dst, long src_size);
33 33
extern void (*rgb24to16)   (const uint8_t *src, uint8_t *dst, long src_size);
34 34
extern void (*rgb24to15)   (const uint8_t *src, uint8_t *dst, long src_size);
......
71 71
extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette);
72 72

  
73 73
/**
74
 *
75
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
76
 * problem for anyone then tell me, and ill fix it)
77
 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
74
 * Height should be a multiple of 2 and width should be a multiple of 16.
75
 * (If this is a problem for anyone then tell me, and I will fix it.)
76
 * Chrominance data is only taken from every second line, others are ignored.
77
 * FIXME: Write HQ version.
78 78
 */
79 79
//void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
80 80

  
81 81
/**
82
 *
83
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
84
 * problem for anyone then tell me, and ill fix it)
82
 * Height should be a multiple of 2 and width should be a multiple of 16.
83
 * (If this is a problem for anyone then tell me, and I will fix it.)
85 84
 */
86 85
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
87 86
                          long width, long height,
88 87
                          long lumStride, long chromStride, long dstStride);
89 88

  
90 89
/**
91
 *
92
 * width should be a multiple of 16
90
 * Width should be a multiple of 16.
93 91
 */
94 92
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
95 93
                             long width, long height,
96 94
                             long lumStride, long chromStride, long dstStride);
97 95

  
98 96
/**
99
 *
100
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
101
 * problem for anyone then tell me, and ill fix it)
97
 * Height should be a multiple of 2 and width should be a multiple of 16.
98
 * (If this is a problem for anyone then tell me, and I will fix it.)
102 99
 */
103 100
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
104 101
                          long width, long height,
105 102
                          long lumStride, long chromStride, long srcStride);
106 103

  
107 104
/**
108
 *
109
 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
110
 * problem for anyone then tell me, and ill fix it)
105
 * Height should be a multiple of 2 and width should be a multiple of 16.
106
 * (If this is a problem for anyone then tell me, and I will fix it.)
111 107
 */
112 108
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
113 109
                          long width, long height,
114 110
                          long lumStride, long chromStride, long dstStride);
115 111

  
116 112
/**
117
 *
118
 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
119
 * problem for anyone then tell me, and ill fix it)
120
 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
113
 * Height should be a multiple of 2 and width should be a multiple of 2.
114
 * (If this is a problem for anyone then tell me, and I will fix it.)
115
 * Chrominance data is only taken from every second line, others are ignored.
116
 * FIXME: Write HQ version.
121 117
 */
122 118
extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
123 119
                           long width, long height,
libswscale/rgb2rgb_template.c
1 1
/*
2
 *  rgb2rgb.c, Software RGB to RGB convertor
3
 *  pluralize by Software PAL8 to RGB convertor
4
 *               Software YUV to YUV convertor
5
 *               Software YUV to RGB convertor
6
 *  Written by Nick Kurshev.
7
 *  palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
 *  lot of big-endian byteorder fixes by Alex Beregszaszi
2
 * software RGB to RGB converter
3
 * pluralize by software PAL8 to RGB converter
4
 *              software YUV to YUV converter
5
 *              software YUV to RGB converter
6
 * Written by Nick Kurshev.
7
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8
 * lot of big-endian byte order fixes by Alex Beregszaszi
9 9
 *
10 10
 * This file is part of FFmpeg.
11 11
 *
......
23 23
 * along with FFmpeg; if not, write to the Free Software
24 24
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 25
 *
26
 * The C code (not assembly, mmx, ...) of this file can be used
26
 * The C code (not assembly, MMX, ...) of this file can be used
27 27
 * under the LGPL license.
28 28
 */
29 29

  
......
229 229
}
230 230

  
231 231
/*
232
 Original by Strepto/Astral
233
 ported to gcc & bugfixed : A'rpi
232
 original by Strepto/Astral
233
 ported to gcc & bugfixed: A'rpi
234 234
 MMX2, 3DNOW optimization by Nick Kurshev
235
 32 bit C version, and and&add trick by Michael Niedermayer
235
 32-bit C version, and and&add trick by Michael Niedermayer
236 236
*/
237 237
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
238 238
{
......
926 926
   ----------------
927 927
   1 1 0 1 1  1 1 0
928 928
   |=======|  |===|
929
       |      Leftmost Bits Repeated to Fill Open Bits
929
       |      leftmost bits repeated to fill open bits
930 930
       |
931
   Original Bits
931
   original bits
932 932
*/
933 933
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
934 934
{
......
1006 1006
        :"=m"(*d)
1007 1007
        :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1008 1008
        :"memory");
1009
        /* Borrowed 32 to 24 */
1009
        /* borrowed 32 to 24 */
1010 1010
        asm volatile(
1011 1011
        "movq       %%mm0, %%mm4    \n\t"
1012 1012
        "movq       %%mm3, %%mm5    \n\t"
......
1147 1147
        :"=m"(*d)
1148 1148
        :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1149 1149
        :"memory");
1150
        /* Borrowed 32 to 24 */
1150
        /* borrowed 32 to 24 */
1151 1151
        asm volatile(
1152 1152
        "movq       %%mm0, %%mm4    \n\t"
1153 1153
        "movq       %%mm3, %%mm5    \n\t"
......
1479 1479
    asm volatile(SFENCE:::"memory");
1480 1480
    asm volatile(EMMS:::"memory");
1481 1481

  
1482
    if (mmx_size==23) return; //finihsed, was multiple of 8
1482
    if (mmx_size==23) return; //finished, was multiple of 8
1483 1483

  
1484 1484
    src+= src_size;
1485 1485
    dst+= src_size;
......
1638 1638
}
1639 1639

  
1640 1640
/**
1641
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1642
 * this is a problem for anyone then tell me, and I will fix it).
1641
 * Height should be a multiple of 2 and width should be a multiple of 16.
1642
 * (If this is a problem for anyone then tell me, and I will fix it.)
1643 1643
 */
1644 1644
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1645 1645
                                      long width, long height,
......
1720 1720
                (vc[0] << 8) + (yc[1] << 0);
1721 1721
#else
1722 1722
            *idst++ = uc[0] + (yc[0] << 8) +
1723
                (vc[0] << 16) + (yc[1] << 24);
1723
               (vc[0] << 16) + (yc[1] << 24);
1724 1724
#endif
1725 1725
            yc += 2;
1726 1726
            uc++;
......
1744 1744
}
1745 1745

  
1746 1746
/**
1747
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1748
 * this is a problem for anyone then tell me, and I will fix it).
1747
 * Height should be a multiple of 2 and width should be a multiple of 16.
1748
 * (If this is a problem for anyone then tell me, and I will fix it.)
1749 1749
 */
1750 1750
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1751 1751
                                      long width, long height,
......
1766 1766
}
1767 1767

  
1768 1768
/**
1769
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1770
 * this is a problem for anyone then tell me, and I will fix it).
1769
 * Height should be a multiple of 2 and width should be a multiple of 16.
1770
 * (If this is a problem for anyone then tell me, and I will fix it.)
1771 1771
 */
1772 1772
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1773 1773
                                      long width, long height,
......
2002 2002
}
2003 2003

  
2004 2004
/**
2005
 * Height should be a multiple of 2 and width should be a multiple of 16 (if
2006
 * this is a problem for anyone then tell me, and I will fix it).
2007
 * Chrominance data is only taken from every secound line, others are ignored.
2005
 * Height should be a multiple of 2 and width should be a multiple of 16.
2006
 * (If this is a problem for anyone then tell me, and I will fix it.)
2007
 * Chrominance data is only taken from every second line, others are ignored.
2008 2008
 * FIXME: Write HQ version.
2009 2009
 */
2010 2010
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
......
2128 2128
}
2129 2129

  
2130 2130
/**
2131
 * Height should be a multiple of 2 and width should be a multiple of 2 (if
2132
 * this is a problem for anyone then tell me, and I will fix it).
2133
 * Chrominance data is only taken from every secound line,
2131
 * Height should be a multiple of 2 and width should be a multiple of 2.
2132
 * (If this is a problem for anyone then tell me, and I will fix it.)
2133
 * Chrominance data is only taken from every second line,
2134 2134
 * others are ignored in the C version.
2135 2135
 * FIXME: Write HQ version.
2136 2136
 */
libswscale/swscale_altivec_template.c
245 245
        src_v = vec_mergeh(src_v, (vector signed short)vzero);
246 246

  
247 247
        filter_v = vec_ld(i << 3, filter);
248
        // the 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2)
248
        // The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
249 249

  
250
        // the neat trick : we only care for half the elements,
250
        // The neat trick: We only care for half the elements,
251 251
        // high or low depending on (i<<3)%16 (it's 0 or 8 here),
252
        // and we're going to use vec_mule, so we chose
253
        // carefully how to "unpack" the elements into the even slots
252
        // and we're going to use vec_mule, so we choose
253
        // carefully how to "unpack" the elements into the even slots.
254 254
        if ((i << 3) % 16)
255 255
            filter_v = vec_mergel(filter_v, (vector signed short)vzero);
256 256
        else
......
405 405
        return srcSliceH;
406 406
    }
407 407

  
408
    /* this code assume:
408
    /* This code assumes:
409 409

  
410 410
    1) dst is 16 bytes-aligned
411 411
    2) dstStride is a multiple of 16
412 412
    3) width is a multiple of 16
413
    4) lum&chrom stride are multiple of 8
413
    4) lum & chrom stride are multiples of 8
414 414
    */
415 415

  
416 416
    for (y=0; y<height; y++) {
......
482 482
        return srcSliceH;
483 483
    }
484 484

  
485
    /* this code assume:
485
    /* This code assumes:
486 486

  
487 487
    1) dst is 16 bytes-aligned
488 488
    2) dstStride is a multiple of 16
489 489
    3) width is a multiple of 16
490
    4) lum&chrom stride are multiple of 8
490
    4) lum & chrom stride are multiples of 8
491 491
    */
492 492

  
493 493
    for (y=0; y<height; y++) {
libswscale/swscale_bfin.c
1 1
/*
2 2
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3 3
 *
4
 * Blackfin Software Video SCALER Operations
4
 * Blackfin software video scaler operations
5 5
 *
6 6
 * This file is part of FFmpeg.
7 7
 *
libswscale/swscale_internal.h
37 37
typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY,
38 38
             int srcSliceH, uint8_t* dst[], int dstStride[]);
39 39

  
40
/* this struct should be aligned on at least 32-byte boundary */
40
/* This struct should be aligned on at least a 32-byte boundary. */
41 41
typedef struct SwsContext{
42 42
    /**
43 43
     * info on struct for av_log
......
73 73
    int16_t *vChrFilter;
74 74
    int16_t *vChrFilterPos;
75 75

  
76
    uint8_t formatConvBuffer[VOF]; //FIXME dynamic alloc, but we have to change a lot of code for this to be useful
76
    uint8_t formatConvBuffer[VOF]; //FIXME dynamic allocation, but we have to change a lot of code for this to be useful
77 77

  
78 78
    int hLumFilterSize;
79 79
    int hChrFilterSize;
......
122 122
#define V_OFFSET              "10*8"
123 123
#define LUM_MMX_FILTER_OFFSET "11*8"
124 124
#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"
125
#define DSTW_OFFSET           "11*8+4*4*256*2" //do not change, it is hardcoded in the asm
125
#define DSTW_OFFSET           "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
126 126
#define ESP_OFFSET            "11*8+4*4*256*2+8"
127 127
#define VROUNDER_OFFSET       "11*8+4*4*256*2+16"
128 128
#define U_TEMP                "11*8+4*4*256*2+24"
libswscale/swscale_template.c
17 17
 * along with FFmpeg; if not, write to the Free Software
18 18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 19
 *
20
 * the C code (not assembly, mmx, ...) of this file can be used
21
 * under the LGPL license too
20
 * The C code (not assembly, MMX, ...) of this file can be used
21
 * under the LGPL license.
22 22
 */
23 23

  
24 24
#undef REAL_MOVNTQ
......
30 30
#undef SFENCE
31 31

  
32 32
#ifdef HAVE_3DNOW
33
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
33
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
34 34
#define EMMS     "femms"
35 35
#else
36 36
#define EMMS     "emms"
......
1503 1503
    const int yalpha1=0;
1504 1504
    int i;
1505 1505

  
1506
    uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1506
    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1507 1507
    const int yalpha= 4096; //FIXME ...
1508 1508

  
1509 1509
    if (flags&SWS_FULL_CHR_H_INT)
......
1700 1700
    }
1701 1701
}
1702 1702

  
1703
//FIXME yuy2* can read upto 7 samples to much
1703
//FIXME yuy2* can read up to 7 samples too much
1704 1704

  
1705 1705
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1706 1706
{
......
2297 2297
    }
2298 2298
}
2299 2299

  
2300
// Bilinear / Bicubic scaling
2300
// bilinear / bicubic scaling
2301 2301
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2302 2302
                                  int16_t *filter, int16_t *filterPos, long filterSize)
2303 2303
{
......
2544 2544
    }
2545 2545

  
2546 2546
#ifdef HAVE_MMX
2547
    // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2547
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2548 2548
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2549 2549
#else
2550 2550
    if (!(flags&SWS_FAST_BILINEAR))
......
2552 2552
    {
2553 2553
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2554 2554
    }
2555
    else // Fast Bilinear upscale / crap downscale
2555
    else // fast bilinear upscale / crap downscale
2556 2556
    {
2557 2557
#if defined(ARCH_X86)
2558 2558
#ifdef HAVE_MMX2
......
2761 2761
    }
2762 2762

  
2763 2763
#ifdef HAVE_MMX
2764
    // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2764
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2765 2765
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2766 2766
#else
2767 2767
    if (!(flags&SWS_FAST_BILINEAR))
......
2770 2770
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2771 2771
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2772 2772
    }
2773
    else // Fast Bilinear upscale / crap downscale
2773
    else // fast bilinear upscale / crap downscale
2774 2774
    {
2775 2775
#if defined(ARCH_X86)
2776 2776
#ifdef HAVE_MMX2
......
2890 2890
            "cmp        %2, %%"REG_a"               \n\t"
2891 2891
            " jb        1b                          \n\t"
2892 2892

  
2893
/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2894
   which is needed to support GCC-4.0 */
2893
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2894
   which is needed to support GCC 4.0. */
2895 2895
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2896 2896
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2897 2897
#else
......
2963 2963
    int lastDstY;
2964 2964
    uint8_t *pal=NULL;
2965 2965

  
2966
    /* vars whch will change and which we need to storw back in the context */
2966
    /* vars which will change and which we need to store back in the context */
2967 2967
    int dstY= c->dstY;
2968 2968
    int lumBufIndex= c->lumBufIndex;
2969 2969
    int chrBufIndex= c->chrBufIndex;
......
3004 3004
        if (flags & SWS_PRINT_INFO && firstTime)
3005 3005
        {
3006 3006
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
3007
                   "         ->cannot do aligned memory acesses anymore\n");
3007
                   "         ->cannot do aligned memory accesses anymore\n");
3008 3008
            firstTime=0;
3009 3009
        }
3010 3010
    }
3011 3011

  
3012
    /* Note the user might start scaling the picture in the middle so this will not get executed
3013
       this is not really intended but works currently, so ppl might do it */
3012
    /* Note the user might start scaling the picture in the middle so this
3013
       will not get executed. This is not really intended but works
3014
       currently, so people might do it. */
3014 3015
    if (srcSliceY ==0){
3015 3016
        lumBufIndex=0;
3016 3017
        chrBufIndex=0;
......
3182 3183
            {
3183 3184
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3184 3185
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3185
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
3186
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3186 3187
                {
3187 3188
                    int16_t *lumBuf = lumPixBuf[0];
3188 3189
                    int16_t *chrBuf= chrPixBuf[0];
......
3200 3201
            {
3201 3202
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3202 3203
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3203
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
3204
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3204 3205
                {
3205 3206
                    int chrAlpha= vChrFilter[2*dstY+1];
3206 3207
                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3207 3208
                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
3208 3209
                }
3209
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
3210
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3210 3211
                {
3211 3212
                    int lumAlpha= vLumFilter[2*dstY+1];
3212 3213
                    int chrAlpha= vChrFilter[2*dstY+1];
......
3217 3218
                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3218 3219
                        dest, dstW, lumAlpha, chrAlpha, dstY);
3219 3220
                }
3220
                else //General RGB
3221
                else //general RGB
3221 3222
                {
3222 3223
                    RENAME(yuv2packedX)(c,
3223 3224
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
libswscale/yuv2rgb.c
39 39
#include "swscale.h"
40 40
#include "swscale_internal.h"
41 41

  
42
#define DITHER1XBPP // only for mmx
42
#define DITHER1XBPP // only for MMX
43 43

  
44 44
const uint8_t  __attribute__((aligned(8))) dither_2x2_4[2][8]={
45 45
{  1,   3,   1,   3,   1,   3,   1,   3, },
......
155 155
DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
156 156
DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
157 157

  
158
// the volatile is required because gcc otherwise optimizes some writes away not knowing that these
159
// are read in the asm block
158
// The volatile is required because gcc otherwise optimizes some writes away
159
// not knowing that these are read in the ASM block.
160 160
static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither;
161 161
static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither;
162 162
static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither;
......
641 641
    }
642 642
#endif
643 643

  
644
    av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found\n");
644
    av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found.\n");
645 645

  
646 646
    switch(c->dstFormat){
647 647
    case PIX_FMT_BGR32:
libswscale/yuv2rgb_altivec.c
21 21
 */
22 22

  
23 23
/*
24
convert I420 YV12 to RGB in various formats,
25
  it rejects images that are not in 420 formats
26
  it rejects images that don't have widths of multiples of 16
27
  it rejects images that don't have heights of multiples of 2
28
reject defers to C simulation codes.
24
Convert I420 YV12 to RGB in various formats,
25
  it rejects images that are not in 420 formats,
26
  it rejects images that don't have widths of multiples of 16,
27
  it rejects images that don't have heights of multiples of 2.
28
Reject defers to C simulation code.
29 29

  
30
lots of optimizations to be done here
30
Lots of optimizations to be done here.
31 31

  
32
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
33
   so we currently use max min to clip
32
1. Need to fix saturation code. I just couldn't get it to fly with packs
33
   and adds, so we currently use max/min to clip.
34 34

  
35
2. the inefficient use of chroma loading needs a bit of brushing up
35
2. The inefficient use of chroma loading needs a bit of brushing up.
36 36

  
37
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls
37
3. Analysis of pipeline stalls needs to be done. Use shark to identify
38
   pipeline stalls.
38 39

  
39 40

  
40 41
MODIFIED to calculate coeffs from currently selected color space.
41
MODIFIED core to be a macro which you spec the output format.
42
ADDED UYVY conversion which is never called due to some thing in SWSCALE.
42
MODIFIED core to be a macro where you specify the output format.
43
ADDED UYVY conversion which is never called due to some thing in swscale.
43 44
CORRECTED algorithm selection to be strict on input formats.
44
ADDED runtime detection of altivec.
45
ADDED runtime detection of AltiVec.
45 46

  
46 47
ADDED altivec_yuv2packedX vertical scl + RGB converter
47 48

  
48 49
March 27,2004
49 50
PERFORMANCE ANALYSIS
50 51

  
51
The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test
52
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence
52
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo
53
used as test.
54
The AltiVec version uses 10% of the processor or ~100Mips for D1 video
55
same sequence.
53 56

  
54
720*480*30  ~10MPS
57
720 * 480 * 30  ~10MPS
55 58

  
56
so we have roughly 10clocks per pixel this is too high something has to be wrong.
59
so we have roughly 10 clocks per pixel. This is too high, something has
60
to be wrong.
57 61

  
58
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min.
62
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
63
need for vec_min.
59 64

  
60
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much
61
guaranteed to have the input video frame it was just decompressed so
62
it probably resides in L1 caches.  However we are creating the
63
output video stream this needs to use the DSTST instruction to
64
optimize for the cache.  We couple this with the fact that we are
65
not going to be visiting the input buffer again so we mark it Least
66
Recently Used.  This shaves 25% of the processor cycles off.
65
OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have
66
the input video frame, it was just decompressed so it probably resides in L1
67
caches. However, we are creating the output video stream. This needs to use the
68
DSTST instruction to optimize for the cache. We couple this with the fact that
69
we are not going to be visiting the input buffer again so we mark it Least
70
Recently Used. This shaves 25% of the processor cycles off.
67 71

  
68
Now MEMCPY is the largest mips consumer in the system, probably due
72
Now memcpy is the largest mips consumer in the system, probably due
69 73
to the inefficient X11 stuff.
70 74

  
71 75
GL libraries seem to be very slow on this machine 1.33Ghz PB running
72 76
Jaguar, this is not the case for my 1Ghz PB.  I thought it might be
73
a versioning issues, however I have libGL.1.2.dylib for both
74
machines. ((We need to figure this out now))
77
a versioning issue, however I have libGL.1.2.dylib for both
78
machines. (We need to figure this out now.)
75 79

  
76
GL2 libraries work now with patch for RGB32
80
GL2 libraries work now with patch for RGB32.
77 81

  
78
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
82
NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
79 83

  
80
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment.
84
Integrated luma prescaling adjustment for saturation/contrast/brightness
85
adjustment.
81 86
*/
82 87

  
83 88
#include <stdio.h>
libswscale/yuv2rgb_bfin.c
1 1
/*
2 2
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
3
 *                    April 20, 2007
4 3
 *
5
 * Blackfin Video Color Space Converters Operations
6
 *  convert I420 YV12 to RGB in various formats,
4
 * Blackfin video color space converter operations
5
 * convert I420 YV12 to RGB in various formats
7 6
 *
8 7
 * This file is part of FFmpeg.
9 8
 *
......
200 199
        return 0;
201 200
    }
202 201

  
203
    av_log(c, AV_LOG_INFO, "BlackFin Accelerated Color Space Converter %s\n",
202
    av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
204 203
           sws_format_name (c->dstFormat));
205 204

  
206 205
    return f;
libswscale/yuv2rgb_mlib.c
1 1
/*
2
 * yuv2rgb_mlib.c, Software YUV to RGB converter using mediaLib
2
 * software YUV to RGB converter using mediaLib
3
 *
3 4
 * Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at>
4 5
 *
5 6
 * This file is part of FFmpeg.
libswscale/yuv2rgb_template.c
1 1
/*
2
 * yuv2rgb_mmx.c, Software YUV to RGB converter with Intel MMX "technology"
2
 * yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology"
3 3
 *
4 4
 * Copyright (C) 2000, Silicon Integrated System Corp.
5 5
 *
......
31 31
#undef SFENCE
32 32

  
33 33
#ifdef HAVE_3DNOW
34
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
34
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
35 35
#define EMMS     "femms"
36 36
#else
37 37
#define EMMS     "emms"
......
147 147
        g6Dither= ff_dither4[y&1];
148 148
        g5Dither= ff_dither8[y&1];
149 149
        r5Dither= ff_dither8[(y+1)&1];
150
        /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
151
           pixels in each iteration */
150
        /* This MMX assembly code deals with a SINGLE scan line at a time,
151
         * it converts 8 pixels in each iteration. */
152 152
        asm volatile (
153 153
        /* load data for start of next scan line */
154 154
        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
......
156 156
        "movq (%5, %0, 2), %%mm6;" /* Load 8  Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
157 157
        //".balign 16     \n\t"
158 158
        "1:             \n\t"
159
        /* no speed diference on my p3@500 with prefetch,
160
         * if it is faster for anyone with -benchmark then tell me
159
        /* No speed difference on my p3@500 with prefetch,
160
         * if it is faster for anyone with -benchmark then tell me.
161 161
        PREFETCH" 64(%0) \n\t"
162 162
        PREFETCH" 64(%1) \n\t"
163 163
        PREFETCH" 64(%2) \n\t"
......
180 180
        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
181 181
        "movq %%mm2, %%mm7;" /* Copy G7-G0 */
182 182

  
183
        /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
183
        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
184 184
        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
185 185
        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
186 186

  
......
190 190
        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
191 191
        MOVNTQ "      %%mm0, (%1);" /* store pixel 0-3 */
192 192

  
193
        /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
193
        /* convert RGB24 plane to RGB16 pack for pixel 4-7 */
194 194
        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
195 195
        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
196 196

  
......
242 242
        g6Dither= ff_dither4[y&1];
243 243
        g5Dither= ff_dither8[y&1];
244 244
        r5Dither= ff_dither8[(y+1)&1];
245
        /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
246
           pixels in each iteration */
245
        /* This MMX assembly code deals with a SINGLE scan line at a time,
246
         * it converts 8 pixels in each iteration. */
247 247
        asm volatile (
248 248
        /* load data for start of next scan line */
249 249
        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
......
271 271
        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
272 272
        "movq %%mm2, %%mm7;" /* Copy G7-G0 */
273 273

  
274
        /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
274
        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
275 275
        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
276 276
        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
277 277

  
......
281 281
        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
282 282
        MOVNTQ "      %%mm0, (%1);"  /* store pixel 0-3 */
283 283

  
284
        /* convert rgb24 plane to rgb16 pack for pixel 0-3 */
284
        /* convert RGB24 plane to RGB16 pack for pixel 4-7 */
285 285
        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
286 286
        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */
287 287

  
......
326 326
        uint8_t *pv = src[2] + (y>>1)*srcStride[2];
327 327
        long index= -h_size/2;
328 328

  
329
        /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
330
           pixels in each iteration */
329
        /* This MMX assembly code deals with a SINGLE scan line at a time,
330
         * it converts 8 pixels in each iteration. */
331 331
        asm volatile (
332 332
        /* load data for start of next scan line */
333 333
        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */
......
472 472
        uint8_t *pv = src[2] + (y>>1)*srcStride[2];
473 473
        long index= -h_size/2;
474 474

  
475
        /* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
476
           pixels in each iteration */
475
        /* This MMX assembly code deals with a SINGLE scan line at a time,
476
         * it converts 8 pixels in each iteration. */
477 477
        asm volatile (
478 478
        /* load data for start of next scan line */
479 479
        "movd    (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

Also available in: Unified diff