/* ffmpeg / libswscale / x86 / yuv2rgb_template.c */
1 |
/*
|
---|---|
2 |
* yuv2rgb_mmx.c, software YUV to RGB converter with Intel MMX "technology"
|
3 |
*
|
4 |
* Copyright (C) 2000, Silicon Integrated System Corp
|
5 |
*
|
6 |
* Author: Olie Lho <ollie@sis.com.tw>
|
7 |
*
|
8 |
* 15,24 bpp and dithering from Michael Niedermayer (michaelni@gmx.at)
|
9 |
* MMX/MMX2 Template stuff from Michael Niedermayer (needed for fast movntq support)
|
10 |
* context / deglobalize stuff by Michael Niedermayer
|
11 |
*
|
12 |
* This file is part of mpeg2dec, a free MPEG-2 video decoder
|
13 |
*
|
14 |
* mpeg2dec is free software; you can redistribute it and/or modify
|
15 |
* it under the terms of the GNU General Public License as published by
|
16 |
* the Free Software Foundation; either version 2, or (at your option)
|
17 |
* any later version.
|
18 |
*
|
19 |
* mpeg2dec is distributed in the hope that it will be useful,
|
20 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
21 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
22 |
* GNU General Public License for more details.
|
23 |
*
|
24 |
* You should have received a copy of the GNU General Public License
|
25 |
* along with mpeg2dec; if not, write to the Free Software
|
26 |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
27 |
*/
|
28 |
|
29 |
/* Per-variant instruction selection: this template is included once per
 * optimization flavor (plain MMX, MMX2/3DNow), so the macros are reset and
 * redefined for each inclusion. */
#undef MOVNTQ
#undef EMMS
#undef SFENCE

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if HAVE_MMX2
/* MMX2: non-temporal stores (bypass cache) plus the matching store fence. */
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
/* Plain MMX: fall back to a normal store; SFENCE degrades to an asm comment
 * so the "SFENCE\n\t EMMS" epilogue still assembles. */
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
/* Core colorspace conversion for 8 pixels: expects mm0 = 4 Cb bytes,
 * mm1 = 4 Cr bytes, mm6 = 8 Y bytes, mm4 = 0; leaves the clamped,
 * interleaved result as mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0.
 * Coefficients/offsets are loaded from the SwsContext via operand %4. */
#define YUV2RGB \
/* Do the multiply part of the conversion for even and odd pixels,
   register usage:
   mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
   mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
   mm6 -> Y even, mm7 -> Y odd */\
    /* convert the chroma part */\
    "punpcklbw %%mm4, %%mm0;" /* scatter 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
    "punpcklbw %%mm4, %%mm1;" /* scatter 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
\
    "psllw $3, %%mm0;" /* Promote precision */ \
    "psllw $3, %%mm1;" /* Promote precision */ \
\
    "psubsw "U_OFFSET"(%4), %%mm0;" /* Cb -= 128 */ \
    "psubsw "V_OFFSET"(%4), %%mm1;" /* Cr -= 128 */ \
\
    "movq %%mm0, %%mm2;" /* Copy 4 Cb 00 u3 00 u2 00 u1 00 u0 */ \
    "movq %%mm1, %%mm3;" /* Copy 4 Cr 00 v3 00 v2 00 v1 00 v0 */ \
\
    "pmulhw "UG_COEFF"(%4), %%mm2;" /* Mul Cb with green coeff -> Cb green */ \
    "pmulhw "VG_COEFF"(%4), %%mm3;" /* Mul Cr with green coeff -> Cr green */ \
\
    "pmulhw "UB_COEFF"(%4), %%mm0;" /* Mul Cb -> Cblue 00 b3 00 b2 00 b1 00 b0 */\
    "pmulhw "VR_COEFF"(%4), %%mm1;" /* Mul Cr -> Cred 00 r3 00 r2 00 r1 00 r0 */\
\
    "paddsw %%mm3, %%mm2;" /* Cb green + Cr green -> Cgreen */\
\
    /* convert the luma part */\
    "movq %%mm6, %%mm7;" /* Copy 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
    "pand "MANGLE(mmx_00ffw)", %%mm6;" /* get Y even 00 Y6 00 Y4 00 Y2 00 Y0 */\
\
    "psrlw $8, %%mm7;" /* get Y odd 00 Y7 00 Y5 00 Y3 00 Y1 */\
\
    "psllw $3, %%mm6;" /* Promote precision */\
    "psllw $3, %%mm7;" /* Promote precision */\
\
    "psubw "Y_OFFSET"(%4), %%mm6;" /* Y -= 16 */\
    "psubw "Y_OFFSET"(%4), %%mm7;" /* Y -= 16 */\
\
    "pmulhw "Y_COEFF"(%4), %%mm6;" /* Mul 4 Y even 00 y6 00 y4 00 y2 00 y0 */\
    "pmulhw "Y_COEFF"(%4), %%mm7;" /* Mul 4 Y odd  00 y7 00 y5 00 y3 00 y1 */\
\
/* Do the addition part of the conversion for even and odd pixels,
   register usage:
   mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels,
   mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels,
   mm6 -> Y even, mm7 -> Y odd */\
    "movq %%mm0, %%mm3;" /* Copy Cblue */\
    "movq %%mm1, %%mm4;" /* Copy Cred */\
    "movq %%mm2, %%mm5;" /* Copy Cgreen */\
\
    "paddsw %%mm6, %%mm0;" /* Y even + Cblue  00 B6 00 B4 00 B2 00 B0 */\
    "paddsw %%mm7, %%mm3;" /* Y odd  + Cblue  00 B7 00 B5 00 B3 00 B1 */\
\
    "paddsw %%mm6, %%mm1;" /* Y even + Cred   00 R6 00 R4 00 R2 00 R0 */\
    "paddsw %%mm7, %%mm4;" /* Y odd  + Cred   00 R7 00 R5 00 R3 00 R1 */\
\
    "paddsw %%mm6, %%mm2;" /* Y even + Cgreen 00 G6 00 G4 00 G2 00 G0 */\
    "paddsw %%mm7, %%mm5;" /* Y odd  + Cgreen 00 G7 00 G5 00 G3 00 G1 */\
\
    /* Limit RGB even to 0..255 (packuswb saturates) */\
    "packuswb %%mm0, %%mm0;" /* B6 B4 B2 B0 B6 B4 B2 B0 */\
    "packuswb %%mm1, %%mm1;" /* R6 R4 R2 R0 R6 R4 R2 R0 */\
    "packuswb %%mm2, %%mm2;" /* G6 G4 G2 G0 G6 G4 G2 G0 */\
\
    /* Limit RGB odd to 0..255 */\
    "packuswb %%mm3, %%mm3;" /* B7 B5 B3 B1 B7 B5 B3 B1 */\
    "packuswb %%mm4, %%mm4;" /* R7 R5 R3 R1 R7 R5 R3 R1 */\
    "packuswb %%mm5, %%mm5;" /* G7 G5 G3 G1 G7 G5 G3 G1 */\
\
    /* Interleave RGB even and odd */\
    "punpcklbw %%mm3, %%mm0;" /* B7 B6 B5 B4 B3 B2 B1 B0 */\
    "punpcklbw %%mm4, %%mm1;" /* R7 R6 R5 R4 R3 R2 R1 R0 */\
    "punpcklbw %%mm5, %%mm2;" /* G7 G6 G5 G4 G3 G2 G1 G0 */\
/* The loop body below reads chroma with a (y>>1) vertical subsample; for
 * 4:2:2 input (full vertical chroma resolution) doubling the chroma strides
 * cancels that shift so every line gets its own chroma row. */
#define YUV422_UNSHIFT \
    if(c->srcFormat == PIX_FMT_YUV422P) {\
        srcStride[1] *= 2; \
        srcStride[2] *= 2; \
    } \
/* Common per-slice loop header: rounds the output width up to a multiple of
 * 8 (the asm converts 8 pixels per iteration), backs off by 8 if that would
 * overrun the destination stride, zeroes mm4 (used as the unpack constant),
 * and sets up per-row plane pointers.  'index' counts up from -h_size/2 to 0
 * so the asm can use js/1b as the loop condition. */
#define YUV2RGB_LOOP(depth)                                   \
    h_size= (c->dstW+7)&~7;                                   \
    if(h_size*depth > FFABS(dstStride[0])) h_size-=8;         \
\
    __asm__ volatile ("pxor %mm4, %mm4;" /* zero mm4 */ );    \
    for (y= 0; y<srcSliceH; y++ ) {                           \
        uint8_t *image = dst[0] + (y+srcSliceY)*dstStride[0]; \
        uint8_t *py = src[0] + y*srcStride[0];                \
        uint8_t *pu = src[1] + (y>>1)*srcStride[1];           \
        uint8_t *pv = src[2] + (y>>1)*srcStride[2];           \
        x86_reg index= -h_size/2;                             \
/* Opens the per-row asm statement: pre-loads the first 4 Cb, 4 Cr and 8 Y
 * samples and places the "1:" loop label.  Operands (bound later by
 * YUV2RGB_OPERANDS*): %0=index, %1=image, %2=pu-index, %3=pv-index,
 * %4=&c->redDither, %5=py-2*index. */
#define YUV2RGB_INIT \
    /* This MMX assembly code deals with a SINGLE scan line at a time, \
     * it converts 8 pixels in each iteration. */ \
    __asm__ volatile ( \
    /* load data for start of next scan line */ \
    "movd (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
    "movd (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
    "movq (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
    /* \
    ".balign 16 \n\t" \
    */ \
    "1: \n\t" \
    /* No speed difference on my p3@500 with prefetch, \
     * if it is faster for anyone with -benchmark then tell me. \
    PREFETCH" 64(%0) \n\t" \
    PREFETCH" 64(%1) \n\t" \
    PREFETCH" 64(%2) \n\t" \
    */ \
/* Closes the asm loop: advance the output pointer by 8 pixels of 'depth'
 * bytes and the chroma index by 4; 'index' is negative until the row is
 * done, so "js 1b" jumps back while it stays below zero. */
#define YUV2RGB_ENDLOOP(depth) \
    "add $"AV_STRINGIFY(depth*8)", %1 \n\t" \
    "add $4, %0 \n\t" \
    " js 1b \n\t" \
/* Binds the asm operands, closes the asm statement and the per-row for-loop
 * opened by YUV2RGB_LOOP, flushes non-temporal stores (SFENCE) and resets
 * the FPU/MMX state (EMMS) before returning the number of rows written. */
#define YUV2RGB_OPERANDS \
    : "+r" (index), "+r" (image) \
    : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index) \
    ); \
    } \
    __asm__ volatile (SFENCE"\n\t"EMMS); \
    return srcSliceH; \
/* Same as YUV2RGB_OPERANDS but with one extra input operand: %6 = pa-2*index,
 * the alpha plane pointer used by the yuva420 variants. */
#define YUV2RGB_OPERANDS_ALPHA \
    : "+r" (index), "+r" (image) \
    : "r" (pu - index), "r" (pv - index), "r"(&c->redDither), "r" (py - 2*index), "r" (pa - 2*index) \
    ); \
    } \
    __asm__ volatile (SFENCE"\n\t"EMMS); \
    return srcSliceH; \
/* Convert a planar YUV420/YUV422 slice to packed RGB565 (16 bpp), 8 pixels
 * per asm iteration, with optional 1-bit ordered dithering (DITHER1XBPP).
 * Returns srcSliceH (number of rows converted) via YUV2RGB_OPERANDS. */
static inline int RENAME(yuv420_rgb16)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[])
{
    int y, h_size;

    YUV422_UNSHIFT
    YUV2RGB_LOOP(2)

        /* alternate the dither pattern per row; green gets the 4-level table
           because it keeps 6 bits in 565 (blue/red keep 5 -> 8-level table) */
        c->blueDither= ff_dither8[y&1];
        c->greenDither= ff_dither4[y&1];
        c->redDither= ff_dither8[(y+1)&1];

        YUV2RGB_INIT
        YUV2RGB

#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%4), %%mm0;"
        "paddusb "GREEN_DITHER"(%4), %%mm2;"
        "paddusb "RED_DITHER"(%4), %%mm1;"
#endif
        /* mask unneeded bits off */
        "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
        "pand "MANGLE(mmx_grnmask)", %%mm2;" /* g7g6g5g4 g3g2_0_0 g7g6g5g4 g3g2_0_0 */
        "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */

        "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
        "pxor %%mm4, %%mm4;" /* zero mm4 */

        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
        "movq %%mm2, %%mm7;" /* Copy G7-G0 */

        /* convert RGB24 plane to RGB16 pack for pixel 0-3 */
        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw $3, %%mm2;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
        "por %%mm2, %%mm0;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */

        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */

        /* convert RGB24 plane to RGB16 pack for pixel 4-7 */
        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3g2_0_0 */
        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw $3, %%mm7;" /* 0_0_0_0 0_g7g6g5 g4g3g2_0 0_0_0_0 */
        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

        "por %%mm7, %%mm5;" /* r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3 */
        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

        MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */

    YUV2RGB_ENDLOOP(2)
    YUV2RGB_OPERANDS
}
/* Convert a planar YUV420/YUV422 slice to packed RGB555 (15 bpp).  Same
 * structure as yuv420_rgb16 but all three channels keep 5 bits, so green
 * uses the 5-bit mask (mmx_redmask) and an 8-level dither, and red is
 * shifted right one extra bit before packing. */
static inline int RENAME(yuv420_rgb15)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[])
{
    int y, h_size;

    YUV422_UNSHIFT
    YUV2RGB_LOOP(2)

        c->blueDither= ff_dither8[y&1];
        c->greenDither= ff_dither8[y&1];
        c->redDither= ff_dither8[(y+1)&1];

        YUV2RGB_INIT
        YUV2RGB

#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%4), %%mm0 \n\t"
        "paddusb "GREEN_DITHER"(%4), %%mm2 \n\t"
        "paddusb "RED_DITHER"(%4), %%mm1 \n\t"
#endif

        /* mask unneeded bits off */
        "pand "MANGLE(mmx_redmask)", %%mm0;" /* b7b6b5b4 b3_0_0_0 b7b6b5b4 b3_0_0_0 */
        "pand "MANGLE(mmx_redmask)", %%mm2;" /* g7g6g5g4 g3_0_0_0 g7g6g5g4 g3_0_0_0 */
        "pand "MANGLE(mmx_redmask)", %%mm1;" /* r7r6r5r4 r3_0_0_0 r7r6r5r4 r3_0_0_0 */

        "psrlw $3, %%mm0;" /* 0_0_0_b7 b6b5b4b3 0_0_0_b7 b6b5b4b3 */
        "psrlw $1, %%mm1;" /* 0_r7r6r5 r4r3_0_0 0_r7r6r5 r4r3_0_0 */
        "pxor %%mm4, %%mm4;" /* zero mm4 */

        "movq %%mm0, %%mm5;" /* Copy B7-B0 */
        "movq %%mm2, %%mm7;" /* Copy G7-G0 */

        /* convert RGB24 plane to RGB15 pack for pixel 0-3 */
        "punpcklbw %%mm4, %%mm2;" /* 0_0_0_0 0_0_0_0 g7g6g5g4 g3_0_0_0 */
        "punpcklbw %%mm1, %%mm0;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw $2, %%mm2;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
        "por %%mm2, %%mm0;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */

        "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
        MOVNTQ " %%mm0, (%1);" /* store pixel 0-3 */

        /* convert RGB24 plane to RGB15 pack for pixel 4-7 */
        "punpckhbw %%mm4, %%mm7;" /* 0_0_0_0 0_0_0_0 0_g7g6g5 g4g3_0_0 */
        "punpckhbw %%mm1, %%mm5;" /* r7r6r5r4 r3_0_0_0 0_0_0_b7 b6b5b4b3 */

        "psllw $2, %%mm7;" /* 0_0_0_0 0_0_g7g6 g5g4g3_0 0_0_0_0 */
        "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */

        "por %%mm7, %%mm5;" /* 0_r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3 */
        "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */

        MOVNTQ " %%mm5, 8 (%1);" /* store pixel 4-7 */

    YUV2RGB_ENDLOOP(2)
    YUV2RGB_OPERANDS
}
#undef RGB_PLANAR2PACKED24
/* Pack the three 8-pixel color planes (mm"red", mm2 = G, mm"blue") into 24
 * bytes of packed 24-bpp output at (%1).  The red/blue macro arguments are
 * MMX register-number strings, so the same macro emits RGB24 ("0","1") or
 * BGR24 ("1","0").  Both variants also pre-load the next iteration's
 * Cb/Cr/Y data and re-zero mm4 before falling into YUV2RGB_ENDLOOP. */
#if HAVE_MMX2
/* MMX2 variant: uses pshufw plus the ff_M24A/B/C byte-select masks to build
 * the three 8-byte groups of the 24-byte triplet stream. */
#define RGB_PLANAR2PACKED24(red, blue)\
    "movq "MANGLE(ff_M24A)", %%mm4 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm"blue", %%mm5 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm2, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm"red", %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand %%mm4, %%mm5 \n\t" /*    B2        B1       B0 */\
    "pand %%mm4, %%mm3 \n\t" /*    G2        G1       G0 */\
    "pand %%mm7, %%mm6 \n\t" /*       R1        R0       */\
\
    "psllq $8, %%mm3 \n\t" /* G2        G1       G0    */\
    "por %%mm5, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ" %%mm6, (%1) \n\t"\
\
    "psrlq $8, %%mm2 \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm"blue", %%mm5\n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm2, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm"red", %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm5 \n\t" /* B5       B4        B3    */\
    "pand %%mm7, %%mm3 \n\t" /*    G4        G3          */\
    "pand %%mm4, %%mm6 \n\t" /*       R4        R3    R2 */\
\
    "por %%mm5, %%mm3 \n\t" /* B5 G4 B4     G3 B3       */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ" %%mm6, 8(%1) \n\t"\
\
    "pshufw $0xFF, %%mm"blue", %%mm5\n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm2, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm"red", %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
    "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */\
\
    "pand %%mm7, %%mm5 \n\t" /*       B7        B6       */\
    "pand %%mm4, %%mm3 \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
    "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */\
\
    "por %%mm5, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ" %%mm6, 16(%1) \n\t"\
    "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
    "pxor %%mm4, %%mm4 \n\t"
#else
/* Plain MMX variant (no pshufw): interleave to 0RGB dwords, then shift and
 * recombine the four dwords into three packed quadwords. */
#define RGB_PLANAR2PACKED24(red, blue)\
    "pxor %%mm4, %%mm4 \n\t"\
    "movq %%mm"blue", %%mm5\n\t" /* B */\
    "movq %%mm"red", %%mm6 \n\t" /* R */\
    "punpcklbw %%mm2, %%mm"blue"\n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm4, %%mm"red" \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm2, %%mm5 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm4, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm"blue", %%mm7\n\t" /* GBGBGBGB 0 */\
    "movq %%mm5, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm"red", %%mm7 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm"red", %%mm"blue"\n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
    "movq %%mm7, %%mm2 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm"blue", %%mm6\n\t" /* 0RGB0RGB 1 */\
    "movq %%mm5, %%mm"red" \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm4 \n\t" /* 0RGB0RGB 3 */\
\
    "psllq $40, %%mm7 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm"blue"\n\t" /* RGB00000 1 */\
    "psllq $40, %%mm5 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm2, %%mm7 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm"blue"\n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm"red", %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm4, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq $8, %%mm7 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm"blue", %%mm6\n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm"blue"\n\t" /* GB000000 1 */\
    "por %%mm"blue", %%mm7\n\t" /* GBRGBRGB 0 */\
    MOVNTQ" %%mm7, (%1) \n\t"\
\
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm5, %%mm"red" \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm5 \n\t" /* BRGB0000 2 */\
    "por %%mm5, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ" %%mm6, 8(%1) \n\t"\
\
    "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */\
\
    "psrlq $40, %%mm"red" \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm"red" \n\t" /* RGBRGBRG 2 */\
    MOVNTQ" %%mm"red", 16(%1)\n\t"\
\
    "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */\
    "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */\
    "pxor %%mm4, %%mm4 \n\t"
#endif
/* Convert a planar YUV420/YUV422 slice to packed RGB24 (3 bytes/pixel).
 * After YUV2RGB the planes sit in mm0=B, mm2=G, mm1=R; the ("0","1")
 * argument order selects RGB byte order in RGB_PLANAR2PACKED24. */
static inline int RENAME(yuv420_rgb24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[])
{
    int y, h_size;

    YUV422_UNSHIFT
    YUV2RGB_LOOP(3)

        YUV2RGB_INIT
        YUV2RGB
        /* mm0=B, %%mm2=G, %%mm1=R */
        RGB_PLANAR2PACKED24("0", "1")

    YUV2RGB_ENDLOOP(3)
    YUV2RGB_OPERANDS
}
/* Convert a planar YUV420/YUV422 slice to packed BGR24.  Identical to
 * yuv420_rgb24 except the red/blue register arguments are swapped. */
static inline int RENAME(yuv420_bgr24)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[])
{
    int y, h_size;

    YUV422_UNSHIFT
    YUV2RGB_LOOP(3)

        YUV2RGB_INIT
        YUV2RGB
        /* mm0=B, %%mm2=G, %%mm1=R */
        RGB_PLANAR2PACKED24("1", "0")

    YUV2RGB_ENDLOOP(3)
    YUV2RGB_OPERANDS
}
433 |
/*

  RGB_PLANAR2PACKED32(red,green,blue,alpha)

  convert RGB plane to RGB packed format

  macro parameters specify the output color channel order:

  RGB_PLANAR2PACKED32(REG_RED,  REG_GREEN, REG_BLUE, REG_ALPHA) for RGBA output,
  RGB_PLANAR2PACKED32(REG_BLUE, REG_GREEN, REG_RED,  REG_ALPHA) for BGRA output,
  RGB_PLANAR2PACKED32(REG_ALPHA,REG_BLUE,  REG_GREEN,REG_RED)   for ABGR output,

  etc.
*/

/* MMX register numbers (as strings, spliced into the asm text) that hold
 * each channel after YUV2RGB: mm0=B, mm1=R, mm2=G, mm3=A. */
#define REG_BLUE "0"
#define REG_RED "1"
#define REG_GREEN "2"
#define REG_ALPHA "3"
/* Interleave four 8-pixel planes into 32 bytes of packed 32-bpp output at
 * (%1), channel order chosen by the register-number string arguments; also
 * pre-loads the next iteration's Cb/Cr/Y and re-zeroes mm4. */
#define RGB_PLANAR2PACKED32(red,green,blue,alpha) \
    /* convert RGB plane to RGB packed format, \
       mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> A, \
       mm4 -> GB, mm5 -> AR pixel 4-7, \
       mm6 -> GB, mm7 -> AR pixel 0-3 */ \
    "movq %%mm" blue ", %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ \
    "movq %%mm" red ", %%mm7;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ \
\
    "movq %%mm" blue ", %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ \
    "movq %%mm" red ", %%mm5;" /* R7 R6 R5 R4 R3 R2 R1 R0 */ \
\
    "punpcklbw %%mm" green ", %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm" alpha ", %%mm7;" /* A3 R3 A2 R2 A1 R1 A0 R0 */ \
\
    "punpcklwd %%mm7, %%mm6;" /* A1 R1 B1 G1 A0 R0 B0 G0 */ \
    MOVNTQ " %%mm6, (%1);" /* Store ARGB1 ARGB0 */ \
\
    "movq %%mm" blue ", %%mm6;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ \
    "punpcklbw %%mm" green ", %%mm6;" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
\
    "punpckhwd %%mm7, %%mm6;" /* A3 R3 G3 B3 A2 R2 B3 G2 */ \
    MOVNTQ " %%mm6, 8 (%1);" /* Store ARGB3 ARGB2 */ \
\
    "punpckhbw %%mm" green ", %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ \
    "punpckhbw %%mm" alpha ", %%mm5;" /* A7 R7 A6 R6 A5 R5 A4 R4 */ \
\
    "punpcklwd %%mm5, %%mm4;" /* A5 R5 B5 G5 A4 R4 B4 G4 */ \
    MOVNTQ " %%mm4, 16 (%1);" /* Store ARGB5 ARGB4 */ \
\
    "movq %%mm" blue ", %%mm4;" /* B7 B6 B5 B4 B3 B2 B1 B0 */ \
    "punpckhbw %%mm" green ", %%mm4;" /* G7 B7 G6 B6 G5 B5 G4 B4 */ \
\
    "punpckhwd %%mm5, %%mm4;" /* A7 R7 G7 B7 A6 R6 B6 G6 */ \
    MOVNTQ " %%mm4, 24 (%1);" /* Store ARGB7 ARGB6 */ \
\
    /* pre-load the next iteration's chroma and luma */ \
    "movd 4 (%2, %0), %%mm0;" /* Load 4 Cb 00 00 00 00 u3 u2 u1 u0 */ \
    "movd 4 (%3, %0), %%mm1;" /* Load 4 Cr 00 00 00 00 v3 v2 v1 v0 */ \
\
    "pxor %%mm4, %%mm4;" /* zero mm4 */ \
    "movq 8 (%5, %0, 2), %%mm6;" /* Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */ \
/* Convert a planar YUV420/YUV422 slice to packed 32-bpp RGBA with an opaque
 * alpha channel (mm3 filled with all-ones by pcmpeqd). */
static inline int RENAME(yuv420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[])
{
    int y, h_size;

    YUV422_UNSHIFT
    YUV2RGB_LOOP(4)

        YUV2RGB_INIT
        YUV2RGB
        "pcmpeqd %%mm3, %%mm3;" /* fill mm3 */
        RGB_PLANAR2PACKED32(REG_RED,REG_GREEN,REG_BLUE,REG_ALPHA)

    YUV2RGB_ENDLOOP(4)
    YUV2RGB_OPERANDS
}
/* Convert a planar YUVA420 slice (four planes, src[3] = alpha) to packed
 * 32-bpp RGBA, loading real alpha into mm3 via extra operand %6.
 * Requires HAVE_7REGS for the additional asm operand register.
 * NOTE(review): when HAVE_7REGS is 0 the body is empty and the function
 * returns no value — presumably this variant is never selected in that
 * configuration; verify against the dispatch code. */
static inline int RENAME(yuva420_rgb32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                        int srcSliceH, uint8_t* dst[], int dstStride[])
{
#if HAVE_7REGS
    int y, h_size;

    YUV2RGB_LOOP(4)

        uint8_t *pa = src[3] + y*srcStride[3];
        YUV2RGB_INIT
        YUV2RGB
        "movq (%6, %0, 2), %%mm3;" /* Load 8 A A7 A6 A5 A4 A3 A2 A1 A0 */
        RGB_PLANAR2PACKED32(REG_RED,REG_GREEN,REG_BLUE,REG_ALPHA)

    YUV2RGB_ENDLOOP(4)
    YUV2RGB_OPERANDS_ALPHA
#endif
}
/* Convert a planar YUV420/YUV422 slice to packed 32-bpp BGRA with an opaque
 * alpha channel; identical to yuv420_rgb32 except red/blue are swapped in
 * the pack macro's channel order. */
static inline int RENAME(yuv420_bgr32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                       int srcSliceH, uint8_t* dst[], int dstStride[])
{
    int y, h_size;

    YUV422_UNSHIFT
    YUV2RGB_LOOP(4)

        YUV2RGB_INIT
        YUV2RGB
        "pcmpeqd %%mm3, %%mm3;" /* fill mm3 */
        RGB_PLANAR2PACKED32(REG_BLUE,REG_GREEN,REG_RED,REG_ALPHA)

    YUV2RGB_ENDLOOP(4)
    YUV2RGB_OPERANDS
}
/* Convert a planar YUVA420 slice (src[3] = alpha) to packed 32-bpp BGRA;
 * identical to yuva420_rgb32 except for the red/blue swap.
 * NOTE(review): as with yuva420_rgb32, the function has no return value
 * when HAVE_7REGS is 0 — confirm this variant is never dispatched then. */
static inline int RENAME(yuva420_bgr32)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                                        int srcSliceH, uint8_t* dst[], int dstStride[])
{
#if HAVE_7REGS
    int y, h_size;

    YUV2RGB_LOOP(4)

        uint8_t *pa = src[3] + y*srcStride[3];
        YUV2RGB_INIT
        YUV2RGB
        "movq (%6, %0, 2), %%mm3;" /* Load 8 A A7 A6 A5 A4 A3 A2 A1 A0 */
        RGB_PLANAR2PACKED32(REG_BLUE,REG_GREEN,REG_RED,REG_ALPHA)

    YUV2RGB_ENDLOOP(4)
    YUV2RGB_OPERANDS_ALPHA
#endif
}