Revision 8a322796
libswscale/internal_bfin.S | ||
---|---|---|
2 | 2 |
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> |
3 | 3 |
* April 20, 2007 |
4 | 4 |
* |
5 |
* Blackfin Video Color Space Converters Operations
|
|
6 |
* convert I420 YV12 to RGB in various formats,
|
|
5 |
* Blackfin video color space converter operations
|
|
6 |
* convert I420 YV12 to RGB in various formats
|
|
7 | 7 |
* |
8 | 8 |
* This file is part of FFmpeg. |
9 | 9 |
* |
... | ... | |
24 | 24 |
|
25 | 25 |
|
26 | 26 |
/* |
27 |
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
|
|
28 |
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts
|
|
27 |
YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock |
|
28 |
and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits, packed into shorts.
|
|
29 | 29 |
|
30 | 30 |
|
31 | 31 |
The following calculation is used for the conversion: |
... | ... | |
34 | 34 |
g = clipz((y-oy)*cy + cgv*(v-128) + cgu*(u-128)) |
35 | 35 |
b = clipz((y-oy)*cy + cbu*(u-128)) |
36 | 36 |
|
37 |
y,u,v are pre scaled by a factor of 4 i.e. left shifted to gain precision.
|
|
37 |
y,u,v are prescaled by a factor of 4 i.e. left-shifted to gain precision.
|
|
38 | 38 |
|
39 | 39 |
|
40 | 40 |
New factorization to eliminate the truncation error which was |
41 |
occuring due to the byteop3p. |
|
41 |
occurring due to the byteop3p.
|
|
42 | 42 |
|
43 | 43 |
|
44 |
1) use the bytop16m to subtract quad bytes we use this in U8 this
|
|
44 |
1) Use the byteop16m to subtract quad bytes we use this in U8 this
|
|
45 | 45 |
then so the offsets need to be renormalized to 8bits. |
46 | 46 |
|
47 |
2) scale operands up by a factor of 4 not 8 because Blackfin
|
|
47 |
2) Scale operands up by a factor of 4 not 8 because Blackfin
|
|
48 | 48 |
multiplies include a shift. |
49 | 49 |
|
50 |
3) compute into the accumulators cy*yx0, cy*yx1
|
|
50 |
3) Compute into the accumulators cy*yx0, cy*yx1.
|
|
51 | 51 |
|
52 |
4) compute each of the linear equations
|
|
52 |
4) Compute each of the linear equations:
|
|
53 | 53 |
r = clipz((y - oy) * cy + crv * (v - 128)) |
54 | 54 |
|
55 | 55 |
g = clipz((y - oy) * cy + cgv * (v - 128) + cgu * (u - 128)) |
56 | 56 |
|
57 | 57 |
b = clipz((y - oy) * cy + cbu * (u - 128)) |
58 | 58 |
|
59 |
reuse of the accumulators requires that we actually multiply
|
|
60 |
twice once with addition and the second time with a subtaction. |
|
59 |
Reuse of the accumulators requires that we actually multiply
|
|
60 |
twice once with addition and the second time with a subtraction.
|
|
61 | 61 |
|
62 |
because of this we need to compute the equations in the order R B
|
|
62 |
Because of this we need to compute the equations in the order R B
|
|
63 | 63 |
then G saving the writes for B in the case of 24/32 bit color |
64 | 64 |
formats. |
65 | 65 |
|
66 |
api: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
|
|
66 |
API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
|
|
67 | 67 |
int dW, uint32_t *coeffs); |
68 | 68 |
|
69 | 69 |
A B |
... | ... | |
77 | 77 |
|
78 | 78 |
coeffs is a pointer to oy. |
79 | 79 |
|
80 |
the {rgb} masks are only utilized by the 565 packing algorithm. Note the data
|
|
81 |
replication is used to simplify the internal algorithms for the dual mac architecture
|
|
82 |
of BlackFin. |
|
80 |
The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
|
|
81 |
replication is used to simplify the internal algorithms for the dual MAC
|
|
82 |
architecture of Blackfin.
|
|
83 | 83 |
|
84 |
All routines are exported with _ff_bfin_ as a symbol prefix |
|
84 |
All routines are exported with _ff_bfin_ as a symbol prefix.
|
|
85 | 85 |
|
86 |
rough performance gain compared against -O3:
|
|
86 |
Rough performance gain compared against -O3:
|
|
87 | 87 |
|
88 | 88 |
2779809/1484290 187.28% |
89 | 89 |
|
libswscale/rgb2rgb.c | ||
---|---|---|
1 | 1 |
/* |
2 |
* rgb2rgb.c, Software RGB to RGB convertor
|
|
3 |
* pluralize by Software PAL8 to RGB convertor
|
|
4 |
* Software YUV to YUV convertor
|
|
5 |
* Software YUV to RGB convertor
|
|
6 |
* Written by Nick Kurshev.
|
|
7 |
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
|
|
2 |
* software RGB to RGB converter
|
|
3 |
* pluralize by software PAL8 to RGB converter
|
|
4 |
* software YUV to YUV converter
|
|
5 |
* software YUV to RGB converter
|
|
6 |
* Written by Nick Kurshev. |
|
7 |
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) |
|
8 | 8 |
* |
9 | 9 |
* This file is part of FFmpeg. |
10 | 10 |
* |
... | ... | |
22 | 22 |
* along with FFmpeg; if not, write to the Free Software |
23 | 23 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
24 | 24 |
* |
25 |
* the C code (not assembly, mmx, ...) of this file can be used
|
|
26 |
* under the LGPL license too
|
|
25 |
* The C code (not assembly, MMX, ...) of this file can be used
|
|
26 |
* under the LGPL license.
|
|
27 | 27 |
*/ |
28 | 28 |
#include <inttypes.h> |
29 | 29 |
#include "config.h" |
... | ... | |
33 | 33 |
#include "swscale.h" |
34 | 34 |
#include "swscale_internal.h" |
35 | 35 |
|
36 |
#define FAST_BGR2YV12 // use 7 bit coeffs instead of 15bit
|
|
36 |
#define FAST_BGR2YV12 // use 7-bit instead of 15-bit coefficients
|
|
37 | 37 |
|
38 | 38 |
void (*rgb24to32)(const uint8_t *src, uint8_t *dst, long src_size); |
39 | 39 |
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size); |
... | ... | |
149 | 149 |
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5)) |
150 | 150 |
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5)) |
151 | 151 |
|
152 |
//Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one
|
|
153 |
//Plain C versions
|
|
152 |
//Note: We have C, MMX, MMX2, 3DNOW versions, there is no 3DNOW + MMX2 one.
|
|
153 |
//plain C versions
|
|
154 | 154 |
#undef HAVE_MMX |
155 | 155 |
#undef HAVE_MMX2 |
156 | 156 |
#undef HAVE_3DNOW |
... | ... | |
190 | 190 |
#endif //ARCH_X86 || ARCH_X86_64 |
191 | 191 |
|
192 | 192 |
/* |
193 |
rgb15->rgb16 Original by Strepto/Astral
|
|
193 |
RGB15->RGB16 original by Strepto/Astral
|
|
194 | 194 |
ported to gcc & bugfixed : A'rpi |
195 | 195 |
MMX2, 3DNOW optimization by Nick Kurshev |
196 |
32bit c version, and and&add trick by Michael Niedermayer
|
|
196 |
32-bit C version, and and&add trick by Michael Niedermayer
|
|
197 | 197 |
*/ |
198 | 198 |
|
199 | 199 |
void sws_rgb2rgb_init(int flags){ |
... | ... | |
266 | 266 |
{ |
267 | 267 |
long i; |
268 | 268 |
/* |
269 |
writes 1 byte o much and might cause alignment issues on some architectures?
|
|
269 |
Writes 1 byte too much and might cause alignment issues on some architectures?
|
|
270 | 270 |
for (i=0; i<num_pixels; i++) |
271 | 271 |
((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]]; |
272 | 272 |
*/ |
... | ... | |
284 | 284 |
{ |
285 | 285 |
long i; |
286 | 286 |
/* |
287 |
writes 1 byte o much and might cause alignment issues on some architectures?
|
|
287 |
Writes 1 byte too much and might cause alignment issues on some architectures?
|
|
288 | 288 |
for (i=0; i<num_pixels; i++) |
289 | 289 |
((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[src[i]]; |
290 | 290 |
*/ |
... | ... | |
299 | 299 |
} |
300 | 300 |
|
301 | 301 |
/** |
302 |
* Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
|
|
302 |
* Palette is assumed to contain BGR16, see rgb32to16 to convert the palette.
|
|
303 | 303 |
*/ |
304 | 304 |
void palette8torgb16(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette) |
305 | 305 |
{ |
libswscale/rgb2rgb.h | ||
---|---|---|
1 | 1 |
/* |
2 |
* rgb2rgb.h, Software RGB to RGB convertor
|
|
3 |
* pluralize by Software PAL8 to RGB convertor
|
|
4 |
* Software YUV to YUV convertor
|
|
5 |
* Software YUV to RGB convertor
|
|
2 |
* software RGB to RGB converter
|
|
3 |
* pluralize by software PAL8 to RGB converter
|
|
4 |
* software YUV to YUV converter
|
|
5 |
* software YUV to RGB converter
|
|
6 | 6 |
* Written by Nick Kurshev. |
7 | 7 |
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) |
8 | 8 |
* |
... | ... | |
28 | 28 |
|
29 | 29 |
#include <inttypes.h> |
30 | 30 |
|
31 |
/* A full collection of rgb to rgb(bgr) convertors */
|
|
31 |
/* A full collection of RGB to RGB(BGR) converters */
|
|
32 | 32 |
extern void (*rgb24to32) (const uint8_t *src, uint8_t *dst, long src_size); |
33 | 33 |
extern void (*rgb24to16) (const uint8_t *src, uint8_t *dst, long src_size); |
34 | 34 |
extern void (*rgb24to15) (const uint8_t *src, uint8_t *dst, long src_size); |
... | ... | |
71 | 71 |
extern void palette8tobgr15(const uint8_t *src, uint8_t *dst, long num_pixels, const uint8_t *palette); |
72 | 72 |
|
73 | 73 |
/** |
74 |
* |
|
75 |
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a
|
|
76 |
* problem for anyone then tell me, and ill fix it)
|
|
77 |
* chrominance data is only taken from every secound line others are ignored FIXME write HQ version
|
|
74 |
* Height should be a multiple of 2 and width should be a multiple of 16.
|
|
75 |
* (If this is a problem for anyone then tell me, and I will fix it.)
|
|
76 |
* Chrominance data is only taken from every second line, others are ignored.
|
|
77 |
* FIXME: Write HQ version.
|
|
78 | 78 |
*/ |
79 | 79 |
//void uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
80 | 80 |
|
81 | 81 |
/** |
82 |
* |
|
83 |
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a |
|
84 |
* problem for anyone then tell me, and ill fix it) |
|
82 |
* Height should be a multiple of 2 and width should be a multiple of 16. |
|
83 |
* (If this is a problem for anyone then tell me, and I will fix it.) |
|
85 | 84 |
*/ |
86 | 85 |
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
87 | 86 |
long width, long height, |
88 | 87 |
long lumStride, long chromStride, long dstStride); |
89 | 88 |
|
90 | 89 |
/** |
91 |
* |
|
92 |
* width should be a multiple of 16 |
|
90 |
* Width should be a multiple of 16. |
|
93 | 91 |
*/ |
94 | 92 |
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
95 | 93 |
long width, long height, |
96 | 94 |
long lumStride, long chromStride, long dstStride); |
97 | 95 |
|
98 | 96 |
/** |
99 |
* |
|
100 |
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a |
|
101 |
* problem for anyone then tell me, and ill fix it) |
|
97 |
* Height should be a multiple of 2 and width should be a multiple of 16. |
|
98 |
* (If this is a problem for anyone then tell me, and I will fix it.) |
|
102 | 99 |
*/ |
103 | 100 |
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
104 | 101 |
long width, long height, |
105 | 102 |
long lumStride, long chromStride, long srcStride); |
106 | 103 |
|
107 | 104 |
/** |
108 |
* |
|
109 |
* height should be a multiple of 2 and width should be a multiple of 16 (if this is a |
|
110 |
* problem for anyone then tell me, and ill fix it) |
|
105 |
* Height should be a multiple of 2 and width should be a multiple of 16. |
|
106 |
* (If this is a problem for anyone then tell me, and I will fix it.) |
|
111 | 107 |
*/ |
112 | 108 |
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
113 | 109 |
long width, long height, |
114 | 110 |
long lumStride, long chromStride, long dstStride); |
115 | 111 |
|
116 | 112 |
/** |
117 |
* |
|
118 |
* height should be a multiple of 2 and width should be a multiple of 2 (if this is a
|
|
119 |
* problem for anyone then tell me, and ill fix it)
|
|
120 |
* chrominance data is only taken from every secound line others are ignored FIXME write HQ version
|
|
113 |
* Height should be a multiple of 2 and width should be a multiple of 2.
|
|
114 |
* (If this is a problem for anyone then tell me, and I will fix it.)
|
|
115 |
* Chrominance data is only taken from every second line, others are ignored.
|
|
116 |
* FIXME: Write HQ version.
|
|
121 | 117 |
*/ |
122 | 118 |
extern void (*rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
123 | 119 |
long width, long height, |
libswscale/rgb2rgb_template.c | ||
---|---|---|
1 | 1 |
/* |
2 |
* rgb2rgb.c, Software RGB to RGB convertor
|
|
3 |
* pluralize by Software PAL8 to RGB convertor
|
|
4 |
* Software YUV to YUV convertor
|
|
5 |
* Software YUV to RGB convertor
|
|
6 |
* Written by Nick Kurshev.
|
|
7 |
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
|
|
8 |
* lot of big-endian byteorder fixes by Alex Beregszaszi
|
|
2 |
* software RGB to RGB converter
|
|
3 |
* pluralize by software PAL8 to RGB converter
|
|
4 |
* software YUV to YUV converter
|
|
5 |
* software YUV to RGB converter
|
|
6 |
* Written by Nick Kurshev. |
|
7 |
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) |
|
8 |
* lot of big-endian byte order fixes by Alex Beregszaszi
|
|
9 | 9 |
* |
10 | 10 |
* This file is part of FFmpeg. |
11 | 11 |
* |
... | ... | |
23 | 23 |
* along with FFmpeg; if not, write to the Free Software |
24 | 24 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
25 | 25 |
* |
26 |
* The C code (not assembly, mmx, ...) of this file can be used
|
|
26 |
* The C code (not assembly, MMX, ...) of this file can be used
|
|
27 | 27 |
* under the LGPL license. |
28 | 28 |
*/ |
29 | 29 |
|
... | ... | |
229 | 229 |
} |
230 | 230 |
|
231 | 231 |
/* |
232 |
Original by Strepto/Astral
|
|
233 |
ported to gcc & bugfixed : A'rpi
|
|
232 |
original by Strepto/Astral
|
|
233 |
ported to gcc & bugfixed: A'rpi |
|
234 | 234 |
MMX2, 3DNOW optimization by Nick Kurshev |
235 |
32 bit C version, and and&add trick by Michael Niedermayer
|
|
235 |
32-bit C version, and and&add trick by Michael Niedermayer
|
|
236 | 236 |
*/ |
237 | 237 |
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size) |
238 | 238 |
{ |
... | ... | |
926 | 926 |
---------------- |
927 | 927 |
1 1 0 1 1 1 1 0 |
928 | 928 |
|=======| |===| |
929 |
| Leftmost Bits Repeated to Fill Open Bits
|
|
929 |
| leftmost bits repeated to fill open bits
|
|
930 | 930 |
| |
931 |
Original Bits
|
|
931 |
original bits
|
|
932 | 932 |
*/ |
933 | 933 |
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size) |
934 | 934 |
{ |
... | ... | |
1006 | 1006 |
:"=m"(*d) |
1007 | 1007 |
:"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) |
1008 | 1008 |
:"memory"); |
1009 |
/* Borrowed 32 to 24 */
|
|
1009 |
/* borrowed 32 to 24 */
|
|
1010 | 1010 |
asm volatile( |
1011 | 1011 |
"movq %%mm0, %%mm4 \n\t" |
1012 | 1012 |
"movq %%mm3, %%mm5 \n\t" |
... | ... | |
1147 | 1147 |
:"=m"(*d) |
1148 | 1148 |
:"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) |
1149 | 1149 |
:"memory"); |
1150 |
/* Borrowed 32 to 24 */
|
|
1150 |
/* borrowed 32 to 24 */
|
|
1151 | 1151 |
asm volatile( |
1152 | 1152 |
"movq %%mm0, %%mm4 \n\t" |
1153 | 1153 |
"movq %%mm3, %%mm5 \n\t" |
... | ... | |
1479 | 1479 |
asm volatile(SFENCE:::"memory"); |
1480 | 1480 |
asm volatile(EMMS:::"memory"); |
1481 | 1481 |
|
1482 |
if (mmx_size==23) return; //finihsed, was multiple of 8
|
|
1482 |
if (mmx_size==23) return; //finished, was multiple of 8
|
|
1483 | 1483 |
|
1484 | 1484 |
src+= src_size; |
1485 | 1485 |
dst+= src_size; |
... | ... | |
1638 | 1638 |
} |
1639 | 1639 |
|
1640 | 1640 |
/** |
1641 |
* Height should be a multiple of 2 and width should be a multiple of 16 (if
|
|
1642 |
* this is a problem for anyone then tell me, and I will fix it).
|
|
1641 |
* Height should be a multiple of 2 and width should be a multiple of 16.
|
|
1642 |
* (If this is a problem for anyone then tell me, and I will fix it.)
|
|
1643 | 1643 |
*/ |
1644 | 1644 |
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1645 | 1645 |
long width, long height, |
... | ... | |
1720 | 1720 |
(vc[0] << 8) + (yc[1] << 0); |
1721 | 1721 |
#else |
1722 | 1722 |
*idst++ = uc[0] + (yc[0] << 8) + |
1723 |
(vc[0] << 16) + (yc[1] << 24);
|
|
1723 |
(vc[0] << 16) + (yc[1] << 24); |
|
1724 | 1724 |
#endif |
1725 | 1725 |
yc += 2; |
1726 | 1726 |
uc++; |
... | ... | |
1744 | 1744 |
} |
1745 | 1745 |
|
1746 | 1746 |
/** |
1747 |
* Height should be a multiple of 2 and width should be a multiple of 16 (if
|
|
1748 |
* this is a problem for anyone then tell me, and I will fix it).
|
|
1747 |
* Height should be a multiple of 2 and width should be a multiple of 16. |
|
1748 |
* (If this is a problem for anyone then tell me, and I will fix it.)
|
|
1749 | 1749 |
*/ |
1750 | 1750 |
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, |
1751 | 1751 |
long width, long height, |
... | ... | |
1766 | 1766 |
} |
1767 | 1767 |
|
1768 | 1768 |
/** |
1769 |
* Height should be a multiple of 2 and width should be a multiple of 16 (if
|
|
1770 |
* this is a problem for anyone then tell me, and I will fix it).
|
|
1769 |
* Height should be a multiple of 2 and width should be a multiple of 16.
|
|
1770 |
* (If this is a problem for anyone then tell me, and I will fix it.)
|
|
1771 | 1771 |
*/ |
1772 | 1772 |
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
1773 | 1773 |
long width, long height, |
... | ... | |
2002 | 2002 |
} |
2003 | 2003 |
|
2004 | 2004 |
/** |
2005 |
* Height should be a multiple of 2 and width should be a multiple of 16 (if
|
|
2006 |
* this is a problem for anyone then tell me, and I will fix it).
|
|
2007 |
* Chrominance data is only taken from every secound line, others are ignored.
|
|
2005 |
* Height should be a multiple of 2 and width should be a multiple of 16.
|
|
2006 |
* (If this is a problem for anyone then tell me, and I will fix it.)
|
|
2007 |
* Chrominance data is only taken from every second line, others are ignored. |
|
2008 | 2008 |
* FIXME: Write HQ version. |
2009 | 2009 |
*/ |
2010 | 2010 |
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, |
... | ... | |
2128 | 2128 |
} |
2129 | 2129 |
|
2130 | 2130 |
/** |
2131 |
* Height should be a multiple of 2 and width should be a multiple of 2 (if
|
|
2132 |
* this is a problem for anyone then tell me, and I will fix it).
|
|
2133 |
* Chrominance data is only taken from every secound line,
|
|
2131 |
* Height should be a multiple of 2 and width should be a multiple of 2.
|
|
2132 |
* (If this is a problem for anyone then tell me, and I will fix it.)
|
|
2133 |
* Chrominance data is only taken from every second line, |
|
2134 | 2134 |
* others are ignored in the C version. |
2135 | 2135 |
* FIXME: Write HQ version. |
2136 | 2136 |
*/ |
libswscale/swscale_altivec_template.c | ||
---|---|---|
245 | 245 |
src_v = vec_mergeh(src_v, (vector signed short)vzero); |
246 | 246 |
|
247 | 247 |
filter_v = vec_ld(i << 3, filter); |
248 |
// the 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2)
|
|
248 |
// The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
|
|
249 | 249 |
|
250 |
// the neat trick : we only care for half the elements,
|
|
250 |
// The neat trick: We only care for half the elements,
|
|
251 | 251 |
// high or low depending on (i<<3)%16 (it's 0 or 8 here), |
252 |
// and we're going to use vec_mule, so we chose |
|
253 |
// carefully how to "unpack" the elements into the even slots |
|
252 |
// and we're going to use vec_mule, so we choose
|
|
253 |
// carefully how to "unpack" the elements into the even slots.
|
|
254 | 254 |
if ((i << 3) % 16) |
255 | 255 |
filter_v = vec_mergel(filter_v, (vector signed short)vzero); |
256 | 256 |
else |
... | ... | |
405 | 405 |
return srcSliceH; |
406 | 406 |
} |
407 | 407 |
|
408 |
/* this code assume:
|
|
408 |
/* This code assumes:
|
|
409 | 409 |
|
410 | 410 |
1) dst is 16 bytes-aligned |
411 | 411 |
2) dstStride is a multiple of 16 |
412 | 412 |
3) width is a multiple of 16 |
413 |
4) lum&chrom stride are multiple of 8
|
|
413 |
4) lum & chrom stride are multiples of 8
|
|
414 | 414 |
*/ |
415 | 415 |
|
416 | 416 |
for (y=0; y<height; y++) { |
... | ... | |
482 | 482 |
return srcSliceH; |
483 | 483 |
} |
484 | 484 |
|
485 |
/* this code assume:
|
|
485 |
/* This code assumes:
|
|
486 | 486 |
|
487 | 487 |
1) dst is 16 bytes-aligned |
488 | 488 |
2) dstStride is a multiple of 16 |
489 | 489 |
3) width is a multiple of 16 |
490 |
4) lum&chrom stride are multiple of 8
|
|
490 |
4) lum & chrom stride are multiples of 8
|
|
491 | 491 |
*/ |
492 | 492 |
|
493 | 493 |
for (y=0; y<height; y++) { |
libswscale/swscale_bfin.c | ||
---|---|---|
1 | 1 |
/* |
2 | 2 |
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> |
3 | 3 |
* |
4 |
* Blackfin Software Video SCALER Operations
|
|
4 |
* Blackfin software video scaler operations
|
|
5 | 5 |
* |
6 | 6 |
* This file is part of FFmpeg. |
7 | 7 |
* |
libswscale/swscale_internal.h | ||
---|---|---|
37 | 37 |
typedef int (*SwsFunc)(struct SwsContext *context, uint8_t* src[], int srcStride[], int srcSliceY, |
38 | 38 |
int srcSliceH, uint8_t* dst[], int dstStride[]); |
39 | 39 |
|
40 |
/* this struct should be aligned on at least 32-byte boundary */
|
|
40 |
/* This struct should be aligned on at least a 32-byte boundary. */
|
|
41 | 41 |
typedef struct SwsContext{ |
42 | 42 |
/** |
43 | 43 |
* info on struct for av_log |
... | ... | |
73 | 73 |
int16_t *vChrFilter; |
74 | 74 |
int16_t *vChrFilterPos; |
75 | 75 |
|
76 |
uint8_t formatConvBuffer[VOF]; //FIXME dynamic alloc, but we have to change a lot of code for this to be useful |
|
76 |
uint8_t formatConvBuffer[VOF]; //FIXME dynamic allocation, but we have to change a lot of code for this to be useful
|
|
77 | 77 |
|
78 | 78 |
int hLumFilterSize; |
79 | 79 |
int hChrFilterSize; |
... | ... | |
122 | 122 |
#define V_OFFSET "10*8" |
123 | 123 |
#define LUM_MMX_FILTER_OFFSET "11*8" |
124 | 124 |
#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256" |
125 |
#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the asm
|
|
125 |
#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM
|
|
126 | 126 |
#define ESP_OFFSET "11*8+4*4*256*2+8" |
127 | 127 |
#define VROUNDER_OFFSET "11*8+4*4*256*2+16" |
128 | 128 |
#define U_TEMP "11*8+4*4*256*2+24" |
libswscale/swscale_template.c | ||
---|---|---|
17 | 17 |
* along with FFmpeg; if not, write to the Free Software |
18 | 18 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
19 | 19 |
* |
20 |
* the C code (not assembly, mmx, ...) of this file can be used
|
|
21 |
* under the LGPL license too
|
|
20 |
* The C code (not assembly, MMX, ...) of this file can be used
|
|
21 |
* under the LGPL license.
|
|
22 | 22 |
*/ |
23 | 23 |
|
24 | 24 |
#undef REAL_MOVNTQ |
... | ... | |
30 | 30 |
#undef SFENCE |
31 | 31 |
|
32 | 32 |
#ifdef HAVE_3DNOW |
33 |
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
|
|
33 |
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
|
|
34 | 34 |
#define EMMS "femms" |
35 | 35 |
#else |
36 | 36 |
#define EMMS "emms" |
... | ... | |
1503 | 1503 |
const int yalpha1=0; |
1504 | 1504 |
int i; |
1505 | 1505 |
|
1506 |
uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
|
|
1506 |
uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
|
|
1507 | 1507 |
const int yalpha= 4096; //FIXME ... |
1508 | 1508 |
|
1509 | 1509 |
if (flags&SWS_FULL_CHR_H_INT) |
... | ... | |
1700 | 1700 |
} |
1701 | 1701 |
} |
1702 | 1702 |
|
1703 |
//FIXME yuy2* can read upto 7 samples to much
|
|
1703 |
//FIXME yuy2* can read up to 7 samples too many
|
|
1704 | 1704 |
|
1705 | 1705 |
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width) |
1706 | 1706 |
{ |
... | ... | |
2297 | 2297 |
} |
2298 | 2298 |
} |
2299 | 2299 |
|
2300 |
// Bilinear / Bicubic scaling
|
|
2300 |
// bilinear / bicubic scaling
|
|
2301 | 2301 |
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc, |
2302 | 2302 |
int16_t *filter, int16_t *filterPos, long filterSize) |
2303 | 2303 |
{ |
... | ... | |
2544 | 2544 |
} |
2545 | 2545 |
|
2546 | 2546 |
#ifdef HAVE_MMX |
2547 |
// use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
|
|
2547 |
// Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
|
|
2548 | 2548 |
if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
2549 | 2549 |
#else |
2550 | 2550 |
if (!(flags&SWS_FAST_BILINEAR)) |
... | ... | |
2552 | 2552 |
{ |
2553 | 2553 |
RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize); |
2554 | 2554 |
} |
2555 |
else // Fast Bilinear upscale / crap downscale
|
|
2555 |
else // fast bilinear upscale / crap downscale
|
|
2556 | 2556 |
{ |
2557 | 2557 |
#if defined(ARCH_X86) |
2558 | 2558 |
#ifdef HAVE_MMX2 |
... | ... | |
2761 | 2761 |
} |
2762 | 2762 |
|
2763 | 2763 |
#ifdef HAVE_MMX |
2764 |
// use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
|
|
2764 |
// Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
|
|
2765 | 2765 |
if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed)) |
2766 | 2766 |
#else |
2767 | 2767 |
if (!(flags&SWS_FAST_BILINEAR)) |
... | ... | |
2770 | 2770 |
RENAME(hScale)(dst , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); |
2771 | 2771 |
RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize); |
2772 | 2772 |
} |
2773 |
else // Fast Bilinear upscale / crap downscale
|
|
2773 |
else // fast bilinear upscale / crap downscale
|
|
2774 | 2774 |
{ |
2775 | 2775 |
#if defined(ARCH_X86) |
2776 | 2776 |
#ifdef HAVE_MMX2 |
... | ... | |
2890 | 2890 |
"cmp %2, %%"REG_a" \n\t" |
2891 | 2891 |
" jb 1b \n\t" |
2892 | 2892 |
|
2893 |
/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
|
|
2894 |
which is needed to support GCC-4.0 */
|
|
2893 |
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
|
|
2894 |
which is needed to support GCC 4.0. */
|
|
2895 | 2895 |
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) |
2896 | 2896 |
:: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask), |
2897 | 2897 |
#else |
... | ... | |
2963 | 2963 |
int lastDstY; |
2964 | 2964 |
uint8_t *pal=NULL; |
2965 | 2965 |
|
2966 |
/* vars whch will change and which we need to storw back in the context */
|
|
2966 |
/* vars which will change and which we need to store back in the context */
|
|
2967 | 2967 |
int dstY= c->dstY; |
2968 | 2968 |
int lumBufIndex= c->lumBufIndex; |
2969 | 2969 |
int chrBufIndex= c->chrBufIndex; |
... | ... | |
3004 | 3004 |
if (flags & SWS_PRINT_INFO && firstTime) |
3005 | 3005 |
{ |
3006 | 3006 |
av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n" |
3007 |
" ->cannot do aligned memory acesses anymore\n"); |
|
3007 |
" ->cannot do aligned memory accesses anymore\n");
|
|
3008 | 3008 |
firstTime=0; |
3009 | 3009 |
} |
3010 | 3010 |
} |
3011 | 3011 |
|
3012 |
/* Note the user might start scaling the picture in the middle so this will not get executed |
|
3013 |
this is not really intended but works currently, so ppl might do it */ |
|
3012 |
/* Note the user might start scaling the picture in the middle so this |
|
3013 |
will not get executed. This is not really intended but works |
|
3014 |
currently, so people might do it. */ |
|
3014 | 3015 |
if (srcSliceY ==0){ |
3015 | 3016 |
lumBufIndex=0; |
3016 | 3017 |
chrBufIndex=0; |
... | ... | |
3182 | 3183 |
{ |
3183 | 3184 |
const int chrSkipMask= (1<<c->chrDstVSubSample)-1; |
3184 | 3185 |
if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi |
3185 |
if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
|
|
3186 |
if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
|
|
3186 | 3187 |
{ |
3187 | 3188 |
int16_t *lumBuf = lumPixBuf[0]; |
3188 | 3189 |
int16_t *chrBuf= chrPixBuf[0]; |
... | ... | |
3200 | 3201 |
{ |
3201 | 3202 |
ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2); |
3202 | 3203 |
ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2); |
3203 |
if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
|
|
3204 |
if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
|
|
3204 | 3205 |
{ |
3205 | 3206 |
int chrAlpha= vChrFilter[2*dstY+1]; |
3206 | 3207 |
RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1), |
3207 | 3208 |
dest, dstW, chrAlpha, dstFormat, flags, dstY); |
3208 | 3209 |
} |
3209 |
else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
|
|
3210 |
else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
|
|
3210 | 3211 |
{ |
3211 | 3212 |
int lumAlpha= vLumFilter[2*dstY+1]; |
3212 | 3213 |
int chrAlpha= vChrFilter[2*dstY+1]; |
... | ... | |
3217 | 3218 |
RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1), |
3218 | 3219 |
dest, dstW, lumAlpha, chrAlpha, dstY); |
3219 | 3220 |
} |
3220 |
else //General RGB
|
|
3221 |
else //general RGB
|
|
3221 | 3222 |
{ |
3222 | 3223 |
RENAME(yuv2packedX)(c, |
3223 | 3224 |
vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize, |
libswscale/yuv2rgb.c | ||
---|---|---|
39 | 39 |
#include "swscale.h" |
40 | 40 |
#include "swscale_internal.h" |
41 | 41 |
|
42 |
#define DITHER1XBPP // only for mmx
|
|
42 |
#define DITHER1XBPP // only for MMX
|
|
43 | 43 |
|
44 | 44 |
const uint8_t __attribute__((aligned(8))) dither_2x2_4[2][8]={ |
45 | 45 |
{ 1, 3, 1, 3, 1, 3, 1, 3, }, |
... | ... | |
155 | 155 |
DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL; |
156 | 156 |
DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL; |
157 | 157 |
|
158 |
// the volatile is required because gcc otherwise optimizes some writes away not knowing that these
|
|
159 |
// are read in the asm block
|
|
158 |
// The volatile is required because gcc otherwise optimizes some writes away
|
|
159 |
// not knowing that these are read in the ASM block.
|
|
160 | 160 |
static volatile uint64_t attribute_used __attribute__((aligned(8))) b5Dither; |
161 | 161 |
static volatile uint64_t attribute_used __attribute__((aligned(8))) g5Dither; |
162 | 162 |
static volatile uint64_t attribute_used __attribute__((aligned(8))) g6Dither; |
... | ... | |
641 | 641 |
} |
642 | 642 |
#endif |
643 | 643 |
|
644 |
av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found\n"); |
|
644 |
av_log(c, AV_LOG_WARNING, "No accelerated colorspace conversion found.\n");
|
|
645 | 645 |
|
646 | 646 |
switch(c->dstFormat){ |
647 | 647 |
case PIX_FMT_BGR32: |
libswscale/yuv2rgb_altivec.c | ||
---|---|---|
21 | 21 |
*/ |
22 | 22 |
|
23 | 23 |
/* |
24 |
convert I420 YV12 to RGB in various formats,
|
|
25 |
it rejects images that are not in 420 formats |
|
26 |
it rejects images that don't have widths of multiples of 16 |
|
27 |
it rejects images that don't have heights of multiples of 2 |
|
28 |
reject defers to C simulation codes.
|
|
24 |
Convert I420 YV12 to RGB in various formats,
|
|
25 |
it rejects images that are not in 420 formats,
|
|
26 |
it rejects images that don't have widths of multiples of 16,
|
|
27 |
it rejects images that don't have heights of multiples of 2.
|
|
28 |
Reject defers to C simulation code.
|
|
29 | 29 |
|
30 |
lots of optimizations to be done here
|
|
30 |
Lots of optimizations to be done here.
|
|
31 | 31 |
|
32 |
1. need to fix saturation code, I just couldn't get it to fly with packs and adds.
|
|
33 |
so we currently use max min to clip
|
|
32 |
1. Need to fix saturation code. I just couldn't get it to fly with packs
|
|
33 |
and adds, so we currently use max/min to clip.
|
|
34 | 34 |
|
35 |
2. the inefficient use of chroma loading needs a bit of brushing up
|
|
35 |
2. The inefficient use of chroma loading needs a bit of brushing up.
|
|
36 | 36 |
|
37 |
3. analysis of pipeline stalls needs to be done, use shark to identify pipeline stalls |
|
37 |
3. Analysis of pipeline stalls needs to be done. Use shark to identify |
|
38 |
pipeline stalls. |
|
38 | 39 |
|
39 | 40 |
|
40 | 41 |
MODIFIED to calculate coeffs from currently selected color space. |
41 |
MODIFIED core to be a macro which you spec the output format.
|
|
42 |
ADDED UYVY conversion which is never called due to some thing in SWSCALE.
|
|
42 |
MODIFIED core to be a macro where you specify the output format.
|
|
43 |
ADDED UYVY conversion which is never called due to something in swscale.
|
|
43 | 44 |
CORRECTED algorithm selection to be strict on input formats. |
44 |
ADDED runtime detection of altivec.
|
|
45 |
ADDED runtime detection of AltiVec.
|
|
45 | 46 |
|
46 | 47 |
ADDED altivec_yuv2packedX vertical scl + RGB converter |
47 | 48 |
|
48 | 49 |
March 27,2004 |
49 | 50 |
PERFORMANCE ANALYSIS |
50 | 51 |
|
51 |
The C version use 25% of the processor or ~250Mips for D1 video rawvideo used as test |
|
52 |
The ALTIVEC version uses 10% of the processor or ~100Mips for D1 video same sequence |
|
52 |
The C version uses 25% of the processor or ~250Mips for D1 video rawvideo |
|
53 |
used as test. |
|
54 |
The AltiVec version uses 10% of the processor or ~100Mips for D1 video |
|
55 |
same sequence. |
|
53 | 56 |
|
54 |
720*480*30 ~10MPS
|
|
57 |
720 * 480 * 30 ~10MPS
|
|
55 | 58 |
|
56 |
so we have roughly 10clocks per pixel this is too high something has to be wrong. |
|
59 |
so we have roughly 10 clocks per pixel. This is too high, something has |
|
60 |
to be wrong. |
|
57 | 61 |
|
58 |
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the need for vec_min. |
|
62 |
OPTIMIZED clip codes to utilize vec_max and vec_packs removing the |
|
63 |
need for vec_min. |
|
59 | 64 |
|
60 |
OPTIMIZED DST OUTPUT cache/dma controls. we are pretty much |
|
61 |
guaranteed to have the input video frame it was just decompressed so |
|
62 |
it probably resides in L1 caches. However we are creating the |
|
63 |
output video stream this needs to use the DSTST instruction to |
|
64 |
optimize for the cache. We couple this with the fact that we are |
|
65 |
not going to be visiting the input buffer again so we mark it Least |
|
66 |
Recently Used. This shaves 25% of the processor cycles off. |
|
65 |
OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed to have |
|
66 |
the input video frame, it was just decompressed so it probably resides in L1 |
|
67 |
caches. However, we are creating the output video stream. This needs to use the |
|
68 |
DSTST instruction to optimize for the cache. We couple this with the fact that |
|
69 |
we are not going to be visiting the input buffer again so we mark it Least |
|
70 |
Recently Used. This shaves 25% of the processor cycles off. |
|
67 | 71 |
|
68 |
Now MEMCPY is the largest mips consumer in the system, probably due
|
|
72 |
Now memcpy is the largest mips consumer in the system, probably due
|
|
69 | 73 |
to the inefficient X11 stuff. |
70 | 74 |
|
71 | 75 |
GL libraries seem to be very slow on this machine 1.33GHz PB running |
72 | 76 |
Jaguar, this is not the case for my 1GHz PB. I thought it might be |
73 |
a versioning issues, however I have libGL.1.2.dylib for both
|
|
74 |
machines. ((We need to figure this out now))
|
|
77 |
a versioning issue, however I have libGL.1.2.dylib for both |
|
78 |
machines. (We need to figure this out now.)
|
|
75 | 79 |
|
76 |
GL2 libraries work now with patch for RGB32 |
|
80 |
GL2 libraries work now with patch for RGB32.
|
|
77 | 81 |
|
78 |
NOTE quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor
|
|
82 |
NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
|
|
79 | 83 |
|
80 |
Integrated luma prescaling adjustment for saturation/contrast/brightness adjustment. |
|
84 |
Integrated luma prescaling adjustment for saturation/contrast/brightness |
|
85 |
adjustment. |
|
81 | 86 |
*/ |
82 | 87 |
|
83 | 88 |
#include <stdio.h> |
libswscale/yuv2rgb_bfin.c | ||
---|---|---|
1 | 1 |
/* |
2 | 2 |
* Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com> |
3 |
* April 20, 2007 |
|
4 | 3 |
* |
5 |
* Blackfin Video Color Space Converters Operations
|
|
6 |
* convert I420 YV12 to RGB in various formats,
|
|
4 |
* Blackfin video color space converter operations
|
|
5 |
* convert I420 YV12 to RGB in various formats
|
|
7 | 6 |
* |
8 | 7 |
* This file is part of FFmpeg. |
9 | 8 |
* |
... | ... | |
200 | 199 |
return 0; |
201 | 200 |
} |
202 | 201 |
|
203 |
av_log(c, AV_LOG_INFO, "BlackFin Accelerated Color Space Converter %s\n",
|
|
202 |
av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
|
|
204 | 203 |
sws_format_name (c->dstFormat)); |
205 | 204 |
|
206 | 205 |
return f; |
libswscale/yuv2rgb_mlib.c | ||
---|---|---|
1 | 1 |
/* |
2 |
* yuv2rgb_mlib.c, Software YUV to RGB converter using mediaLib |
|
2 |
* software YUV to RGB converter using mediaLib |
|
3 |
* |
|
3 | 4 |
* Copyright (C) 2003 Michael Niedermayer <michaelni@gmx.at> |
4 | 5 |
* |
5 | 6 |
* This file is part of FFmpeg. |
libswscale/yuv2rgb_template.c | ||
---|---|---|
1 | 1 |
/* |
2 |
* yuv2rgb_mmx.c, Software YUV to RGB converter with Intel MMX "technology"
|
|
2 |
* yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology"
|
|
3 | 3 |
* |
4 | 4 |
* Copyright (C) 2000, Silicon Integrated System Corp. |
5 | 5 |
* |
... | ... | |
31 | 31 |
#undef SFENCE |
32 | 32 |
|
33 | 33 |
#ifdef HAVE_3DNOW |
34 |
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
|
|
34 |
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
|
|
35 | 35 |
#define EMMS "femms" |
36 | 36 |
#else |
37 | 37 |
#define EMMS "emms" |
... | ... | |
147 | 147 |
g6Dither= ff_dither4[y&1]; |
148 | 148 |
g5Dither= ff_dither8[y&1]; |
149 | 149 |
r5Dither= ff_dither8[(y+1)&1]; |
150 |
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
|
|
151 |
pixels in each iteration */
|
|
150 |
/* This MMX assembly code deals with a SINGLE scan line at a time,
|
|
151 |
* it converts 8 pixels in each iteration. */
|
|
152 | 152 |
asm volatile ( |
153 | 153 |
/* load data for start of next scan line */ |
154 | 154 |
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
... | ... | |
156 | 156 |
"movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
157 | 157 |
//".balign 16 \n\t" |
158 | 158 |
"1: \n\t" |
159 |
/* no speed diference on my p3@500 with prefetch,
|
|
160 |
* if it is faster for anyone with -benchmark then tell me |
|
159 |
/* No speed difference on my p3@500 with prefetch,
|
|
160 |
* if it is faster for anyone with -benchmark then tell me.
|
|
161 | 161 |
PREFETCH" 64(%0) \n\t" |
162 | 162 |
PREFETCH" 64(%1) \n\t" |
163 | 163 |
PREFETCH" 64(%2) \n\t" |
... | ... | |
180 | 180 |
"movq %%mm0, %%mm5;" /* Copy B7-B0 */ |
181 | 181 |
"movq %%mm2, %%mm7;" /* Copy G7-G0 */ |
182 | 182 |
|
183 |
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
|
|
183 |
/* convert RGB24 plane to RGB16 pack for pixel 0-3 */
|
|
184 | 184 |
"punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ |
185 | 185 |
"punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ |
186 | 186 |
|
... | ... | |
190 | 190 |
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
191 | 191 |
MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ |
192 | 192 |
|
193 |
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
|
|
193 |
/* convert RGB24 plane to RGB16 pack for pixel 4-7 */
|
|
194 | 194 |
"punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */ |
195 | 195 |
"punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ |
196 | 196 |
|
... | ... | |
242 | 242 |
g6Dither= ff_dither4[y&1]; |
243 | 243 |
g5Dither= ff_dither8[y&1]; |
244 | 244 |
r5Dither= ff_dither8[(y+1)&1]; |
245 |
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
|
|
246 |
pixels in each iteration */
|
|
245 |
/* This MMX assembly code deals with a SINGLE scan line at a time,
|
|
246 |
* it converts 8 pixels in each iteration. */
|
|
247 | 247 |
asm volatile ( |
248 | 248 |
/* load data for start of next scan line */ |
249 | 249 |
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
... | ... | |
271 | 271 |
"movq %%mm0, %%mm5;" /* Copy B7-B0 */ |
272 | 272 |
"movq %%mm2, %%mm7;" /* Copy G7-G0 */ |
273 | 273 |
|
274 |
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
|
|
274 |
/* convert RGB24 plane to RGB16 pack for pixel 0-3 */
|
|
275 | 275 |
"punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */ |
276 | 276 |
"punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ |
277 | 277 |
|
... | ... | |
281 | 281 |
"movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ |
282 | 282 |
MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */ |
283 | 283 |
|
284 |
/* convert rgb24 plane to rgb16 pack for pixel 0-3 */
|
|
284 |
/* convert RGB24 plane to RGB16 pack for pixel 4-7 */
|
|
285 | 285 |
"punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */ |
286 | 286 |
"punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */ |
287 | 287 |
|
... | ... | |
326 | 326 |
uint8_t *pv = src[2] + (y>>1)*srcStride[2]; |
327 | 327 |
long index= -h_size/2; |
328 | 328 |
|
329 |
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
|
|
330 |
pixels in each iteration */
|
|
329 |
/* This MMX assembly code deals with a SINGLE scan line at a time,
|
|
330 |
* it converts 8 pixels in each iteration. */
|
|
331 | 331 |
asm volatile ( |
332 | 332 |
/* load data for start of next scan line */ |
333 | 333 |
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
... | ... | |
472 | 472 |
uint8_t *pv = src[2] + (y>>1)*srcStride[2]; |
473 | 473 |
long index= -h_size/2; |
474 | 474 |
|
475 |
/* this mmx assembly code deals with SINGLE scan line at a time, it convert 8
|
|
476 |
pixels in each iteration */
|
|
475 |
/* This MMX assembly code deals with a SINGLE scan line at a time,
|
|
476 |
* it converts 8 pixels in each iteration. */
|
|
477 | 477 |
asm volatile ( |
478 | 478 |
/* load data for start of next scan line */ |
479 | 479 |
"movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ |
Also available in: Unified diff