Revision d604bab9 postproc/swscale.c
postproc/swscale.c  

3  3  
4  4 
// Original C implementation by A'rpi/ESPteam <arpi@thot.banki.hu> 
5  5 
// current version mostly by Michael Niedermayer (michaelni@gmx.at) 
6 
// the parts written by michael are under GNU GPL 

6  7  
7  8 
#include <inttypes.h> 
8  9 
#include "../config.h" 
10 
#include "swscale.h" 

9  11  
10  12 
//#undef HAVE_MMX2 
11  13 
//#undef HAVE_MMX 
12  14 
//#undef ARCH_X86 
13 
#define DITHER16BPP 

14 
//#define ALT_ERROR 

15 
#define DITHER1XBPP 

16 
int fullUVIpol=0; 

17 
//disables the unscaled height version 

18 
int allwaysIpol=0; 

15  19  
16  20 
#define RET 0xC3 //near return opcode 
17  21 
/* 
18  22 
NOTES 
19  23  
20 
known BUGS with known cause (no bugreports please!) 

21 
code reads 1 sample too much (might cause a sig11) 

24 
known BUGS with known cause (no bugreports please!, but patches are welcome :) ) 

25 
horizontal MMX2 scaler reads 17 samples too much (might cause a sig11) 

26  
27 
Supported output formats BGR15 BGR16 BGR24 BGR32 (15,24 are untested) 

28 
BGR15 & BGR16 MMX versions support dithering 

29 
Special versions: fast Y 1:1 scaling (no interpolation in y direction) 

22  30  
23  31 
TODO 
24 
check alignment of everything


32 
more intelligent misalignment avoidance for the horizontal scaler


25  33 
*/ 
26  34  
27 
static uint64_t yCoeff= 0x2568256825682568LL; 

28 
static uint64_t ubCoeff= 0x3343334333433343LL; 

29 
static uint64_t vrCoeff= 0x40cf40cf40cf40cfLL; 

30 
static uint64_t ugCoeff= 0xE5E2E5E2E5E2E5E2LL; 

31 
static uint64_t vgCoeff= 0xF36EF36EF36EF36ELL; 

32 
static uint64_t w80= 0x0080008000800080LL; 

33 
static uint64_t w10= 0x0010001000100010LL; 

34 
static uint64_t bm00000111=0x0000000000FFFFFFLL; 

35 
static uint64_t bm11111000=0xFFFFFFFFFF000000LL; 

36  
37 
static uint64_t b16Dither= 0x0004000400040004LL; 

38 
static uint64_t b16Dither1=0x0004000400040004LL; 

39 
static uint64_t b16Dither2=0x0602060206020602LL; 

40 
static uint64_t g16Dither= 0x0002000200020002LL; 

41 
static uint64_t g16Dither1=0x0002000200020002LL; 

42 
static uint64_t g16Dither2=0x0301030103010301LL; 

43  
44 
static uint64_t b16Mask= 0x001F001F001F001FLL; 

45 
static uint64_t g16Mask= 0x07E007E007E007E0LL; 

46 
static uint64_t r16Mask= 0xF800F800F800F800LL; 

47 
static uint64_t temp0; 

35 
/* Absolute value of a. NOTE: the ternary evaluates `a` twice, so do not
 * pass expressions with side effects (e.g. ABS(i++)).
 * Fix: the false branch was `((a))` (an identity, returning the negative
 * value unchanged) — restore the negation so negative inputs are flipped. */
#define ABS(a) ((a) > 0 ? (a) : (-(a)))

36  
37 
#ifdef HAVE_MMX2 

38 
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" 

39 
#elif defined (HAVE_3DNOW) 

40 
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" 

41 
#endif 

48  42  
43 
#ifdef HAVE_MMX2 

44 
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" 

45 
#else 

46 
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t" 

47 
#endif 

48  
49  
50 
#ifdef HAVE_MMX 

51 
static uint64_t __attribute__((aligned(8))) yCoeff= 0x2568256825682568LL; 

52 
static uint64_t __attribute__((aligned(8))) ubCoeff= 0x3343334333433343LL; 

53 
static uint64_t __attribute__((aligned(8))) vrCoeff= 0x40cf40cf40cf40cfLL; 

54 
static uint64_t __attribute__((aligned(8))) ugCoeff= 0xE5E2E5E2E5E2E5E2LL; 

55 
static uint64_t __attribute__((aligned(8))) vgCoeff= 0xF36EF36EF36EF36ELL; 

56 
static uint64_t __attribute__((aligned(8))) w400= 0x0400040004000400LL; 

57 
static uint64_t __attribute__((aligned(8))) w80= 0x0080008000800080LL; 

58 
static uint64_t __attribute__((aligned(8))) w10= 0x0010001000100010LL; 

59 
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL; 

60 
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL; 

61 
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL; 

62  
63 
static uint64_t __attribute__((aligned(8))) b16Dither= 0x0004000400040004LL; 

64 
static uint64_t __attribute__((aligned(8))) b16Dither1=0x0004000400040004LL; 

65 
static uint64_t __attribute__((aligned(8))) b16Dither2=0x0602060206020602LL; 

66 
static uint64_t __attribute__((aligned(8))) g16Dither= 0x0002000200020002LL; 

67 
static uint64_t __attribute__((aligned(8))) g16Dither1=0x0002000200020002LL; 

68 
static uint64_t __attribute__((aligned(8))) g16Dither2=0x0301030103010301LL; 

69  
70 
static uint64_t __attribute__((aligned(8))) b16Mask= 0x001F001F001F001FLL; 

71 
static uint64_t __attribute__((aligned(8))) g16Mask= 0x07E007E007E007E0LL; 

72 
static uint64_t __attribute__((aligned(8))) r16Mask= 0xF800F800F800F800LL; 

73 
static uint64_t __attribute__((aligned(8))) b15Mask= 0x001F001F001F001FLL; 

74 
static uint64_t __attribute__((aligned(8))) g15Mask= 0x03E003E003E003E0LL; 

75 
static uint64_t __attribute__((aligned(8))) r15Mask= 0x7C007C007C007C00LL; 

76  
77 
static uint64_t __attribute__((aligned(8))) temp0; 

78 
static uint64_t __attribute__((aligned(8))) asm_yalpha1; 

79 
static uint64_t __attribute__((aligned(8))) asm_uvalpha1; 

80 
#endif 

49  81  
50  82 
// temporary storage for 4 yuv lines: 
51  83 
// 16bit for now (mmx likes it more compact) 
84 
#ifdef HAVE_MMX 

85 
static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048]; 

86 
static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2]; 

87 
#else 

52  88 
static uint16_t pix_buf_y[4][2048]; 
53  89 
static uint16_t pix_buf_uv[2][2048*2]; 
90 
#endif 

54  91  
55  92 
// clipping helper table for C implementations: 
56  93 
static unsigned char clip_table[768]; 
...  ...  
66  103 
static uint8_t funnyYCode[10000]; 
67  104 
static uint8_t funnyUVCode[10000]; 
68  105  
106 
#define FULL_YSCALEYUV2RGB \ 

107 
"pxor %%mm7, %%mm7 \n\t"\ 

108 
"movd %6, %%mm6 \n\t" /*yalpha1*/\ 

109 
"punpcklwd %%mm6, %%mm6 \n\t"\ 

110 
"punpcklwd %%mm6, %%mm6 \n\t"\ 

111 
"movd %7, %%mm5 \n\t" /*uvalpha1*/\ 

112 
"punpcklwd %%mm5, %%mm5 \n\t"\ 

113 
"punpcklwd %%mm5, %%mm5 \n\t"\ 

114 
"xorl %%eax, %%eax \n\t"\ 

115 
"1: \n\t"\ 

116 
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ 

117 
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ 

118 
"movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ 

119 
"movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ 

120 
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax]  buf1[eax]*/\ 

121 
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax]  uvbuf1[eax]*/\ 

122 
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax]  buf1[eax])yalpha1>>16*/\ 

123 
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax]  uvbuf1[eax])uvalpha1>>16*/\ 

124 
"psraw $4, %%mm1 \n\t" /* buf0[eax]  buf1[eax] >>4*/\ 

125 
"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 

126 
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax]  uvbuf1[eax] >>4*/\ 

127 
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1yalpha1) >>16*/\ 

128 
"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ 

129 
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1  uvbuf1[eax](1uvalpha1)*/\ 

130 
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048]  uvbuf1[eax+2048]*/\ 

131 
"psubw w80, %%mm1 \n\t" /* 8(Y16)*/\ 

132 
"psubw w400, %%mm3 \n\t" /* 8(U128)*/\ 

133 
"pmulhw yCoeff, %%mm1 \n\t"\ 

134 
\ 

135 
\ 

136 
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048]  uvbuf1[eax+2048])uvalpha1>>16*/\ 

137 
"movq %%mm3, %%mm2 \n\t" /* (U128)8*/\ 

138 
"pmulhw ubCoeff, %%mm3 \n\t"\ 

139 
"psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048]  uvbuf1[eax+2048] >>4*/\ 

140 
"pmulhw ugCoeff, %%mm2 \n\t"\ 

141 
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1  uvbuf1[eax+2048](1uvalpha1)*/\ 

142 
"psubw w400, %%mm0 \n\t" /* (V128)8*/\ 

143 
\ 

144 
\ 

145 
"movq %%mm0, %%mm4 \n\t" /* (V128)8*/\ 

146 
"pmulhw vrCoeff, %%mm0 \n\t"\ 

147 
"pmulhw vgCoeff, %%mm4 \n\t"\ 

148 
"paddw %%mm1, %%mm3 \n\t" /* B*/\ 

149 
"paddw %%mm1, %%mm0 \n\t" /* R*/\ 

150 
"packuswb %%mm3, %%mm3 \n\t"\ 

151 
\ 

152 
"packuswb %%mm0, %%mm0 \n\t"\ 

153 
"paddw %%mm4, %%mm2 \n\t"\ 

154 
"paddw %%mm2, %%mm1 \n\t" /* G*/\ 

155 
\ 

156 
"packuswb %%mm1, %%mm1 \n\t" 

157  
158 
#define YSCALEYUV2RGB \ 

159 
"movd %6, %%mm6 \n\t" /*yalpha1*/\ 

160 
"punpcklwd %%mm6, %%mm6 \n\t"\ 

161 
"punpcklwd %%mm6, %%mm6 \n\t"\ 

162 
"movq %%mm6, asm_yalpha1 \n\t"\ 

163 
"movd %7, %%mm5 \n\t" /*uvalpha1*/\ 

164 
"punpcklwd %%mm5, %%mm5 \n\t"\ 

165 
"punpcklwd %%mm5, %%mm5 \n\t"\ 

166 
"movq %%mm5, asm_uvalpha1 \n\t"\ 

167 
"xorl %%eax, %%eax \n\t"\ 

168 
"1: \n\t"\ 

169 
"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\ 

170 
"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\ 

171 
"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\ 

172 
"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\ 

173 
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax]  uvbuf1[eax]*/\ 

174 
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048]  uvbuf1[eax+2048]*/\ 

175 
"movq asm_uvalpha1, %%mm0 \n\t"\ 

176 
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax]  uvbuf1[eax])uvalpha1>>16*/\ 

177 
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048]  uvbuf1[eax+2048])uvalpha1>>16*/\ 

178 
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax]  uvbuf1[eax] >>4*/\ 

179 
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048]  uvbuf1[eax+2048] >>4*/\ 

180 
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1  uvbuf1[eax](1uvalpha1)*/\ 

181 
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1  uvbuf1[eax+2048](1uvalpha1)*/\ 

182 
"psubw w400, %%mm3 \n\t" /* (U128)8*/\ 

183 
"psubw w400, %%mm4 \n\t" /* (V128)8*/\ 

184 
"movq %%mm3, %%mm2 \n\t" /* (U128)8*/\ 

185 
"movq %%mm4, %%mm5 \n\t" /* (V128)8*/\ 

186 
"pmulhw ugCoeff, %%mm3 \n\t"\ 

187 
"pmulhw vgCoeff, %%mm4 \n\t"\ 

188 
/* mm2=(U128)8, mm3=ug, mm4=vg mm5=(V128)8 */\ 

189 
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ 

190 
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ 

191 
"movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\ 

192 
"movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\ 

193 
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax]  buf1[eax]*/\ 

194 
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax]  buf1[eax]*/\ 

195 
"pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax]  buf1[eax])yalpha1>>16*/\ 

196 
"pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax]  buf1[eax])yalpha1>>16*/\ 

197 
"psraw $4, %%mm1 \n\t" /* buf0[eax]  buf1[eax] >>4*/\ 

198 
"psraw $4, %%mm7 \n\t" /* buf0[eax]  buf1[eax] >>4*/\ 

199 
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1yalpha1) >>16*/\ 

200 
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1yalpha1) >>16*/\ 

201 
"pmulhw ubCoeff, %%mm2 \n\t"\ 

202 
"pmulhw vrCoeff, %%mm5 \n\t"\ 

203 
"psubw w80, %%mm1 \n\t" /* 8(Y16)*/\ 

204 
"psubw w80, %%mm7 \n\t" /* 8(Y16)*/\ 

205 
"pmulhw yCoeff, %%mm1 \n\t"\ 

206 
"pmulhw yCoeff, %%mm7 \n\t"\ 

207 
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 

208 
"paddw %%mm3, %%mm4 \n\t"\ 

209 
"movq %%mm2, %%mm0 \n\t"\ 

210 
"movq %%mm5, %%mm6 \n\t"\ 

211 
"movq %%mm4, %%mm3 \n\t"\ 

212 
"punpcklwd %%mm2, %%mm2 \n\t"\ 

213 
"punpcklwd %%mm5, %%mm5 \n\t"\ 

214 
"punpcklwd %%mm4, %%mm4 \n\t"\ 

215 
"paddw %%mm1, %%mm2 \n\t"\ 

216 
"paddw %%mm1, %%mm5 \n\t"\ 

217 
"paddw %%mm1, %%mm4 \n\t"\ 

218 
"punpckhwd %%mm0, %%mm0 \n\t"\ 

219 
"punpckhwd %%mm6, %%mm6 \n\t"\ 

220 
"punpckhwd %%mm3, %%mm3 \n\t"\ 

221 
"paddw %%mm7, %%mm0 \n\t"\ 

222 
"paddw %%mm7, %%mm6 \n\t"\ 

223 
"paddw %%mm7, %%mm3 \n\t"\ 

224 
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 

225 
"packuswb %%mm0, %%mm2 \n\t"\ 

226 
"packuswb %%mm6, %%mm5 \n\t"\ 

227 
"packuswb %%mm3, %%mm4 \n\t"\ 

228 
"pxor %%mm7, %%mm7 \n\t" 

229  
230 
#define YSCALEYUV2RGB1 \ 

231 
"xorl %%eax, %%eax \n\t"\ 

232 
"1: \n\t"\ 

233 
"movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\ 

234 
"movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 

235 
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax]  uvbuf1[eax] >>4*/\ 

236 
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048]  uvbuf1[eax+2048] >>4*/\ 

237 
"psubw w400, %%mm3 \n\t" /* (U128)8*/\ 

238 
"psubw w400, %%mm4 \n\t" /* (V128)8*/\ 

239 
"movq %%mm3, %%mm2 \n\t" /* (U128)8*/\ 

240 
"movq %%mm4, %%mm5 \n\t" /* (V128)8*/\ 

241 
"pmulhw ugCoeff, %%mm3 \n\t"\ 

242 
"pmulhw vgCoeff, %%mm4 \n\t"\ 

243 
/* mm2=(U128)8, mm3=ug, mm4=vg mm5=(V128)8 */\ 

244 
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\ 

245 
"movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\ 

246 
"psraw $4, %%mm1 \n\t" /* buf0[eax]  buf1[eax] >>4*/\ 

247 
"psraw $4, %%mm7 \n\t" /* buf0[eax]  buf1[eax] >>4*/\ 

248 
"pmulhw ubCoeff, %%mm2 \n\t"\ 

249 
"pmulhw vrCoeff, %%mm5 \n\t"\ 

250 
"psubw w80, %%mm1 \n\t" /* 8(Y16)*/\ 

251 
"psubw w80, %%mm7 \n\t" /* 8(Y16)*/\ 

252 
"pmulhw yCoeff, %%mm1 \n\t"\ 

253 
"pmulhw yCoeff, %%mm7 \n\t"\ 

254 
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ 

255 
"paddw %%mm3, %%mm4 \n\t"\ 

256 
"movq %%mm2, %%mm0 \n\t"\ 

257 
"movq %%mm5, %%mm6 \n\t"\ 

258 
"movq %%mm4, %%mm3 \n\t"\ 

259 
"punpcklwd %%mm2, %%mm2 \n\t"\ 

260 
"punpcklwd %%mm5, %%mm5 \n\t"\ 

261 
"punpcklwd %%mm4, %%mm4 \n\t"\ 

262 
"paddw %%mm1, %%mm2 \n\t"\ 

263 
"paddw %%mm1, %%mm5 \n\t"\ 

264 
"paddw %%mm1, %%mm4 \n\t"\ 

265 
"punpckhwd %%mm0, %%mm0 \n\t"\ 

266 
"punpckhwd %%mm6, %%mm6 \n\t"\ 

267 
"punpckhwd %%mm3, %%mm3 \n\t"\ 

268 
"paddw %%mm7, %%mm0 \n\t"\ 

269 
"paddw %%mm7, %%mm6 \n\t"\ 

270 
"paddw %%mm7, %%mm3 \n\t"\ 

271 
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ 

272 
"packuswb %%mm0, %%mm2 \n\t"\ 

273 
"packuswb %%mm6, %%mm5 \n\t"\ 

274 
"packuswb %%mm3, %%mm4 \n\t"\ 

275 
"pxor %%mm7, %%mm7 \n\t" 

276  
277 
#define WRITEBGR32 \ 

278 
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 

279 
"movq %%mm2, %%mm1 \n\t" /* B */\ 

280 
"movq %%mm5, %%mm6 \n\t" /* R */\ 

281 
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 

282 
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 

283 
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 

284 
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 

285 
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 

286 
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 

287 
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ 

288 
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 

289 
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ 

290 
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 

291 
\ 

292 
MOVNTQ(%%mm0, (%4, %%eax, 4))\ 

293 
MOVNTQ(%%mm2, 8(%4, %%eax, 4))\ 

294 
MOVNTQ(%%mm1, 16(%4, %%eax, 4))\ 

295 
MOVNTQ(%%mm3, 24(%4, %%eax, 4))\ 

296 
\ 

297 
"addl $8, %%eax \n\t"\ 

298 
"cmpl %5, %%eax \n\t"\ 

299 
" jb 1b \n\t" 

300  
301 
#define WRITEBGR16 \ 

302 
"movq %%mm2, %%mm1 \n\t" /* B */\ 

303 
"movq %%mm4, %%mm3 \n\t" /* G */\ 

304 
"movq %%mm5, %%mm6 \n\t" /* R */\ 

305 
\ 

306 
"punpcklbw %%mm7, %%mm3 \n\t" /* 0G0G0G0G */\ 

307 
"punpcklbw %%mm7, %%mm2 \n\t" /* 0B0B0B0B */\ 

308 
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R */\ 

309 
\ 

310 
"psrlw $3, %%mm2 \n\t"\ 

311 
"psllw $3, %%mm3 \n\t"\ 

312 
"psllw $8, %%mm5 \n\t"\ 

313 
\ 

314 
"pand g16Mask, %%mm3 \n\t"\ 

315 
"pand r16Mask, %%mm5 \n\t"\ 

316 
\ 

317 
"por %%mm3, %%mm2 \n\t"\ 

318 
"por %%mm5, %%mm2 \n\t"\ 

319 
\ 

320 
"punpckhbw %%mm7, %%mm4 \n\t" /* 0G0G0G0G */\ 

321 
"punpckhbw %%mm7, %%mm1 \n\t" /* 0B0B0B0B */\ 

322 
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R */\ 

323 
\ 

324 
"psrlw $3, %%mm1 \n\t"\ 

325 
"psllw $3, %%mm4 \n\t"\ 

326 
"psllw $8, %%mm6 \n\t"\ 

327 
\ 

328 
"pand g16Mask, %%mm4 \n\t"\ 

329 
"pand r16Mask, %%mm6 \n\t"\ 

330 
\ 

331 
"por %%mm4, %%mm1 \n\t"\ 

332 
"por %%mm6, %%mm1 \n\t"\ 

333 
\ 

334 
MOVNTQ(%%mm2, (%4, %%eax, 2))\ 

335 
MOVNTQ(%%mm1, 8(%4, %%eax, 2))\ 

336 
\ 

337 
"addl $8, %%eax \n\t"\ 

338 
"cmpl %5, %%eax \n\t"\ 

339 
" jb 1b \n\t" 

340  
341 
#define WRITEBGR15 \ 

342 
"movq %%mm2, %%mm1 \n\t" /* B */\ 

343 
"movq %%mm4, %%mm3 \n\t" /* G */\ 

344 
"movq %%mm5, %%mm6 \n\t" /* R */\ 

345 
\ 

346 
"punpcklbw %%mm7, %%mm3 \n\t" /* 0G0G0G0G */\ 

347 
"punpcklbw %%mm7, %%mm2 \n\t" /* 0B0B0B0B */\ 

348 
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R */\ 

349 
\ 

350 
"psrlw $3, %%mm2 \n\t"\ 

351 
"psllw $2, %%mm3 \n\t"\ 

352 
"psllw $7, %%mm5 \n\t"\ 

353 
\ 

354 
"pand g15Mask, %%mm3 \n\t"\ 

355 
"pand r15Mask, %%mm5 \n\t"\ 

356 
\ 

357 
"por %%mm3, %%mm2 \n\t"\ 

358 
"por %%mm5, %%mm2 \n\t"\ 

359 
\ 

360 
"punpckhbw %%mm7, %%mm4 \n\t" /* 0G0G0G0G */\ 

361 
"punpckhbw %%mm7, %%mm1 \n\t" /* 0B0B0B0B */\ 

362 
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R */\ 

363 
\ 

364 
"psrlw $3, %%mm1 \n\t"\ 

365 
"psllw $2, %%mm4 \n\t"\ 

366 
"psllw $7, %%mm6 \n\t"\ 

367 
\ 

368 
"pand g15Mask, %%mm4 \n\t"\ 

369 
"pand r15Mask, %%mm6 \n\t"\ 

370 
\ 

371 
"por %%mm4, %%mm1 \n\t"\ 

372 
"por %%mm6, %%mm1 \n\t"\ 

373 
\ 

374 
MOVNTQ(%%mm2, (%4, %%eax, 2))\ 

375 
MOVNTQ(%%mm1, 8(%4, %%eax, 2))\ 

376 
\ 

377 
"addl $8, %%eax \n\t"\ 

378 
"cmpl %5, %%eax \n\t"\ 

379 
" jb 1b \n\t" 

380 
// FIXME find a faster way to shuffle it to BGR24 

381 
#define WRITEBGR24 \ 

382 
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\ 

383 
"movq %%mm2, %%mm1 \n\t" /* B */\ 

384 
"movq %%mm5, %%mm6 \n\t" /* R */\ 

385 
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\ 

386 
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\ 

387 
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\ 

388 
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\ 

389 
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\ 

390 
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\ 

391 
"punpcklbw %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\ 

392 
"punpckhbw %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 

393 
"punpcklbw %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\ 

394 
"punpckhbw %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\ 

395 
\ 

396 
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\ 

397 
"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\ 

398 
"pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\ 

399 
"pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\ 

400 
"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\ 

401 
"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\ 

402 
"psllq $48, %%mm2 \n\t" /* GB000000 1 */\ 

403 
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\ 

404 
\ 

405 
"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\ 

406 
"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\ 

407 
"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\ 

408 
"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\ 

409 
"pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\ 

410 
"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\ 

411 
"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\ 

412 
"pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\ 

413 
"pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\ 

414 
"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\ 

415 
"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\ 

416 
"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\ 

417 
"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\ 

418 
\ 

419 
"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\ 

420 
"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\ 

421 
"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\ 

422 
"pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\ 

423 
"pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\ 

424 
"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\ 

425 
"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\ 

426 
"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\ 

427 
\ 

428 
"leal (%%eax, %%eax, 2), %%ebx \n\t"\ 

429 
MOVNTQ(%%mm0, (%4, %%ebx))\ 

430 
MOVNTQ(%%mm2, 8(%4, %%ebx))\ 

431 
MOVNTQ(%%mm3, 16(%4, %%ebx))\ 

432 
\ 

433 
"addl $8, %%eax \n\t"\ 

434 
"cmpl %5, %%eax \n\t"\ 

435 
" jb 1b \n\t" 

436  
437  
438 
/** 

439 
* vertical scale YV12 to RGB 

440 
*/ 

441 
static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, 

442 
uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp) 

443 
{ 

444 
int yalpha1=yalpha^4095; 

445 
int uvalpha1=uvalpha^4095; 

446 
int i; 

447  
448 
if(fullUVIpol) 

449 
{ 

450  
451 
#ifdef HAVE_MMX 

452 
if(dstbpp == 32) 

453 
{ 

454 
asm volatile( 

455  
456  
457 
FULL_YSCALEYUV2RGB 

458 
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG 

459 
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 

460  
461 
"movq %%mm3, %%mm1 \n\t" 

462 
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 

463 
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 

464  
465 
MOVNTQ(%%mm3, (%4, %%eax, 4)) 

466 
MOVNTQ(%%mm1, 8(%4, %%eax, 4)) 

467  
468 
"addl $4, %%eax \n\t" 

469 
"cmpl %5, %%eax \n\t" 

470 
" jb 1b \n\t" 

471  
472  
473 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

474 
"m" (yalpha1), "m" (uvalpha1) 

475 
: "%eax" 

476 
); 

477 
} 

478 
else if(dstbpp==24) 

479 
{ 

480 
asm volatile( 

481  
482 
FULL_YSCALEYUV2RGB 

483  
484 
// lsb ... msb 

485 
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG 

486 
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 

487  
488 
"movq %%mm3, %%mm1 \n\t" 

489 
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 

490 
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 

491  
492 
"movq %%mm3, %%mm2 \n\t" // BGR0BGR0 

493 
"psrlq $8, %%mm3 \n\t" // GR0BGR00 

494 
"pand bm00000111, %%mm2 \n\t" // BGR00000 

495 
"pand bm11111000, %%mm3 \n\t" // 000BGR00 

496 
"por %%mm2, %%mm3 \n\t" // BGRBGR00 

497 
"movq %%mm1, %%mm2 \n\t" 

498 
"psllq $48, %%mm1 \n\t" // 000000BG 

499 
"por %%mm1, %%mm3 \n\t" // BGRBGRBG 

500  
501 
"movq %%mm2, %%mm1 \n\t" // BGR0BGR0 

502 
"psrld $16, %%mm2 \n\t" // R000R000 

503 
"psrlq $24, %%mm1 \n\t" // 0BGR0000 

504 
"por %%mm2, %%mm1 \n\t" // RBGRR000 

505  
506 
"movl %4, %%ebx \n\t" 

507 
"addl %%eax, %%ebx \n\t" 

508  
509 
#ifdef HAVE_MMX2 

510 
//FIXME Alignment 

511 
"movntq %%mm3, (%%ebx, %%eax, 2)\n\t" 

512 
"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t" 

513 
#else 

514 
"movd %%mm3, (%%ebx, %%eax, 2) \n\t" 

515 
"psrlq $32, %%mm3 \n\t" 

516 
"movd %%mm3, 4(%%ebx, %%eax, 2) \n\t" 

517 
"movd %%mm1, 8(%%ebx, %%eax, 2) \n\t" 

518 
#endif 

519 
"addl $4, %%eax \n\t" 

520 
"cmpl %5, %%eax \n\t" 

521 
" jb 1b \n\t" 

522  
523 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw), 

524 
"m" (yalpha1), "m" (uvalpha1) 

525 
: "%eax", "%ebx" 

526 
); 

527 
} 

528 
else if(dstbpp==15) 

529 
{ 

530 
asm volatile( 

531  
532 
FULL_YSCALEYUV2RGB 

533 
#ifdef DITHER1XBPP 

534 
"paddusb b16Dither, %%mm1 \n\t" 

535 
"paddusb b16Dither, %%mm0 \n\t" 

536 
"paddusb b16Dither, %%mm3 \n\t" 

537 
#endif 

538 
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G 

539 
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B 

540 
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R 

541  
542 
"psrlw $3, %%mm3 \n\t" 

543 
"psllw $2, %%mm1 \n\t" 

544 
"psllw $7, %%mm0 \n\t" 

545 
"pand g15Mask, %%mm1 \n\t" 

546 
"pand r15Mask, %%mm0 \n\t" 

547  
548 
"por %%mm3, %%mm1 \n\t" 

549 
"por %%mm1, %%mm0 \n\t" 

550  
551 
MOVNTQ(%%mm0, (%4, %%eax, 2)) 

552  
553 
"addl $4, %%eax \n\t" 

554 
"cmpl %5, %%eax \n\t" 

555 
" jb 1b \n\t" 

556  
557 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

558 
"m" (yalpha1), "m" (uvalpha1) 

559 
: "%eax" 

560 
); 

561 
} 

562 
else if(dstbpp==16) 

563 
{ 

564 
asm volatile( 

565  
566 
FULL_YSCALEYUV2RGB 

567 
#ifdef DITHER1XBPP 

568 
"paddusb g16Dither, %%mm1 \n\t" 

569 
"paddusb b16Dither, %%mm0 \n\t" 

570 
"paddusb b16Dither, %%mm3 \n\t" 

571 
#endif 

572 
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G 

573 
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B 

574 
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R 

575  
576 
"psrlw $3, %%mm3 \n\t" 

577 
"psllw $3, %%mm1 \n\t" 

578 
"psllw $8, %%mm0 \n\t" 

579 
"pand g16Mask, %%mm1 \n\t" 

580 
"pand r16Mask, %%mm0 \n\t" 

581  
582 
"por %%mm3, %%mm1 \n\t" 

583 
"por %%mm1, %%mm0 \n\t" 

584  
585 
MOVNTQ(%%mm0, (%4, %%eax, 2)) 

586  
587 
"addl $4, %%eax \n\t" 

588 
"cmpl %5, %%eax \n\t" 

589 
" jb 1b \n\t" 

590  
591 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

592 
"m" (yalpha1), "m" (uvalpha1) 

593 
: "%eax" 

594 
); 

595 
} 

596 
#else 

597 
if(dstbpp==32  dstbpp==24) 

598 
{ 

599 
for(i=0;i<dstw;i++){ 

600 
// vertical linear interpolation && yuv2rgb in a single step: 

601 
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; 

602 
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); 

603 
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); 

604 
dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)]; 

605 
dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]; 

606 
dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)]; 

607 
dest+=dstbpp>>3; 

608 
} 

609 
} 

610 
else if(dstbpp==16) 

611 
{ 

612 
for(i=0;i<dstw;i++){ 

613 
// vertical linear interpolation && yuv2rgb in a single step: 

614 
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; 

615 
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); 

616 
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); 

617  
618 
((uint16_t*)dest)[0] = 

619 
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3)  

620 
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0  

621 
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800; 

622 
dest+=2; 

623 
} 

624 
} 

625 
else if(dstbpp==15) 

626 
{ 

627 
for(i=0;i<dstw;i++){ 

628 
// vertical linear interpolation && yuv2rgb in a single step: 

629 
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; 

630 
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19); 

631 
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19); 

632  
633 
((uint16_t*)dest)[0] = 

634 
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3)  

635 
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0  

636 
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00; 

637 
dest+=2; 

638 
} 

639 
} 

640 
#endif 

641 
}//FULL_UV_IPOL 

642 
else 

643 
{ 

644 
#ifdef HAVE_MMX 

645 
if(dstbpp == 32) 

646 
{ 

647 
asm volatile( 

648 
YSCALEYUV2RGB 

649 
WRITEBGR32 

650  
651 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

652 
"m" (yalpha1), "m" (uvalpha1) 

653 
: "%eax" 

654 
); 

655 
} 

656 
else if(dstbpp==24) 

657 
{ 

658 
asm volatile( 

659 
YSCALEYUV2RGB 

660 
WRITEBGR24 

661  
662 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

663 
"m" (yalpha1), "m" (uvalpha1) 

664 
: "%eax", "%ebx" 

665 
); 

666 
} 

667 
else if(dstbpp==15) 

668 
{ 

669 
asm volatile( 

670 
YSCALEYUV2RGB 

671 
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 

672 
#ifdef DITHER1XBPP 

673 
"paddusb b16Dither, %%mm2 \n\t" 

674 
"paddusb b16Dither, %%mm4 \n\t" 

675 
"paddusb b16Dither, %%mm5 \n\t" 

676 
#endif 

677  
678 
WRITEBGR15 

679  
680 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

681 
"m" (yalpha1), "m" (uvalpha1) 

682 
: "%eax" 

683 
); 

684 
} 

685 
else if(dstbpp==16) 

686 
{ 

687 
asm volatile( 

688 
YSCALEYUV2RGB 

689 
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 

690 
#ifdef DITHER1XBPP 

691 
"paddusb g16Dither, %%mm2 \n\t" 

692 
"paddusb b16Dither, %%mm4 \n\t" 

693 
"paddusb b16Dither, %%mm5 \n\t" 

694 
#endif 

695  
696 
WRITEBGR16 

697  
698 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

699 
"m" (yalpha1), "m" (uvalpha1) 

700 
: "%eax" 

701 
); 

702 
} 

703 
#else 

704 
//FIXME unroll C loop and dont recalculate UV 

705 
if(dstbpp==32  dstbpp==24) 

706 
{ 

707 
for(i=0;i<dstw;i++){ 

708 
// vertical linear interpolation && yuv2rgb in a single step: 

709 
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; 

710 
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); 

711 
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); 

712 
dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)]; 

713 
dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]; 

714 
dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)]; 

715 
dest+=dstbpp>>3; 

716 
} 

717 
} 

718 
else if(dstbpp==16) 

719 
{ 

720 
for(i=0;i<dstw;i++){ 

721 
// vertical linear interpolation && yuv2rgb in a single step: 

722 
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; 

723 
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); 

724 
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); 

725  
726 
((uint16_t*)dest)[0] = 

727 
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3)  

728 
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0  

729 
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800; 

730 
dest+=2; 

731 
} 

732 
} 

733 
else if(dstbpp==15) 

734 
{ 

735 
for(i=0;i<dstw;i++){ 

736 
// vertical linear interpolation && yuv2rgb in a single step: 

737 
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)]; 

738 
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); 

739 
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); 

740  
741 
((uint16_t*)dest)[0] = 

742 
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3)  

743 
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0  

744 
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00; 

745 
dest+=2; 

746 
} 

747 
} 

748 
#endif 

749 
} //!FULL_UV_IPOL 

750 
} 

751  
752 
/** 

753 
* YV12 to RGB without scaling or interpolating 

754 
*/ 

755 
static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1, 

756 
uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp) 

757 
{ 

758 
int yalpha1=yalpha^4095; 

759 
int uvalpha1=uvalpha^4095; 

760 
int i; 

761 
if(fullUVIpol  allwaysIpol) 

762 
{ 

763 
yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp); 

764 
return; 

765 
} 

766 
#ifdef HAVE_MMX 

767 
if(dstbpp == 32) 

768 
{ 

769 
asm volatile( 

770 
YSCALEYUV2RGB1 

771 
WRITEBGR32 

772 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

773 
"m" (yalpha1), "m" (uvalpha1) 

774 
: "%eax" 

775 
); 

776 
} 

777 
else if(dstbpp==24) 

778 
{ 

779 
asm volatile( 

780 
YSCALEYUV2RGB1 

781 
WRITEBGR24 

782 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

783 
"m" (yalpha1), "m" (uvalpha1) 

784 
: "%eax", "%ebx" 

785 
); 

786 
} 

787 
else if(dstbpp==15) 

788 
{ 

789 
asm volatile( 

790 
YSCALEYUV2RGB1 

791 
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 

792 
#ifdef DITHER1XBPP 

793 
"paddusb b16Dither, %%mm2 \n\t" 

794 
"paddusb b16Dither, %%mm4 \n\t" 

795 
"paddusb b16Dither, %%mm5 \n\t" 

796 
#endif 

797 
WRITEBGR15 

798 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

799 
"m" (yalpha1), "m" (uvalpha1) 

800 
: "%eax" 

801 
); 

802 
} 

803 
else if(dstbpp==16) 

804 
{ 

805 
asm volatile( 

806 
YSCALEYUV2RGB1 

807 
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ 

808 
#ifdef DITHER1XBPP 

809 
"paddusb g16Dither, %%mm2 \n\t" 

810 
"paddusb b16Dither, %%mm4 \n\t" 

811 
"paddusb b16Dither, %%mm5 \n\t" 

812 
#endif 

813  
814 
WRITEBGR16 

815 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

816 
"m" (yalpha1), "m" (uvalpha1) 

817 
: "%eax" 

818 
); 

819 
} 

820 
#else 

821 
//FIXME unroll C loop and dont recalculate UV 

822 
if(dstbpp==32  dstbpp==24) 

823 
{ 

824 
for(i=0;i<dstw;i++){ 

825 
// vertical linear interpolation && yuv2rgb in a single step: 

826 
int Y=yuvtab_2568[buf0[i]>>7]; 

827 
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); 

828 
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); 

829 
dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)]; 

830 
dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]; 

831 
dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)]; 

832 
dest+=dstbpp>>3; 

833 
} 

834 
} 

835 
else if(dstbpp==16) 

836 
{ 

837 
for(i=0;i<dstw;i++){ 

838 
// vertical linear interpolation && yuv2rgb in a single step: 

839 
int Y=yuvtab_2568[buf0[i]>>7]; 

840 
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); 

841 
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); 

842  
843 
((uint16_t*)dest)[0] = 

844 
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3)  

845 
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0  

846 
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800; 

847 
dest+=2; 

848 
} 

849 
} 

850 
else if(dstbpp==15) 

851 
{ 

852 
for(i=0;i<dstw;i++){ 

853 
// vertical linear interpolation && yuv2rgb in a single step: 

854 
int Y=yuvtab_2568[buf0[i]>>7]; 

855 
int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19); 

856 
int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19); 

857  
858 
((uint16_t*)dest)[0] = 

859 
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3)  

860 
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0  

861 
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00; 

862 
dest+=2; 

863 
} 

864 
} 

865 
#endif 

866 
} 

867  
868  
869  
69  870  
70  871 
// *** bilinear scaling and yuv>rgb conversion of yv12 slices: 
71  872 
// *** Note: it's called multiple times while decoding a frame, first time y==0 
...  ...  
95  896 
// used to detect a horizontal size change 
96  897 
static int old_dstw= 1; 
97  898 
static int old_s_xinc= 1; 
98  
99  899 
#endif 
900  
100  901 
int canMMX2BeUsed=0; 
101  902 
int srcWidth= (dstw*s_xinc + 0x8000)>>16; 
903 
int dstUVw= fullUVIpol ? dstw : dstw/2; 

904  
102  905  
103  906 
#ifdef HAVE_MMX2 
104  907 
canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0; 
...  ...  
111  914 
// first and last pixel 
112  915 
if(canMMX2BeUsed) s_xinc+= 20; 
113  916 
else s_xinc = ((srcWidth2)<<16)/(dstw2)  20; 
114 
s_xinc2=s_xinc>>1; 

115  917  
918 
if(fullUVIpol) s_xinc2= s_xinc>>1; 

919 
else s_xinc2= s_xinc; 

116  920 
// force calculation of the horizontal interpolation of the first line 
117  921 
s_last_ypos=99; 
118  922 
s_last_y1pos=99; 
...  ...  
215  1019 
funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]= 
216  1020 
a  (b<<2)  (c<<4)  (d<<6); 
217  1021  
1022 
// if we dont need to read 8 bytes than dont :), reduces the chance of 

1023 
// crossing a cache line 

1024 
if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E; 

1025  
218  1026 
funnyYCode[fragmentLength*(i+4)/4]= RET; 
219  1027 
} 
220  1028 
xpos+=s_xinc; 
221  1029 
} 
222  1030  
223  1031 
xpos= 0; //s_xinc2/2  0x10000; // difference between centers of chrom samples 
224 
for(i=0; i<dstw/8; i++) 

1032 
for(i=0; i<dstUVw/8; i++)


225  1033 
{ 
226  1034 
int xx=xpos>>16; 
227  1035  
...  ...  
238  1046 
funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]= 
239  1047 
a  (b<<2)  (c<<4)  (d<<6); 
240  1048  
1049 
// if we dont need to read 8 bytes than dont :), reduces the chance of 

1050 
// crossing a cache line 

1051 
if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E; 

1052  
241  1053 
funnyUVCode[fragmentLength*(i+4)/4]= RET; 
242  1054 
} 
243  1055 
xpos+=s_xinc2; 
...  ...  
255  1067 
// points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src) 
256  1068 
int srcuvpos= s_srcypos + s_yinc/2  0x8000; 
257  1069 
int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line 
258 
int yalpha=((s_srcypos1)&0xFFFF)>>7; 

259 
int yalpha1=yalpha^511; 

260 
int uvalpha=((srcuvpos1)&0x1FFFF)>>8; 

261 
int uvalpha1=uvalpha^511; 

1070 
int yalpha=((s_srcypos1)&0xFFFF)>>4; 

1071 
int uvalpha=((srcuvpos1)&0x1FFFF)>>5; 

262  1072 
uint16_t *buf0=pix_buf_y[y0&1]; // top line of the interpolated slice 
263  1073 
uint16_t *buf1=pix_buf_y[((y0+1)&1)]; // bottom line of the interpolated slice 
264  1074 
uint16_t *uvbuf0=pix_buf_uv[y1&1]; // top line of the interpolated slice 
...  ...  
320  1130 
"xorl %%ecx, %%ecx \n\t" 
321  1131 
"xorl %%ebx, %%ebx \n\t" 
322  1132 
"movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF 
323 
// "int $3\n\t" 

324 
"call funnyYCode \n\t" 

325 
"movq temp0, %%mm2 \n\t" 

326 
"xorl %%ecx, %%ecx \n\t" 

327 
"call funnyYCode \n\t" 

328 
"movq temp0, %%mm2 \n\t" 

329 
"xorl %%ecx, %%ecx \n\t" 

330 
"call funnyYCode \n\t" 

331 
"movq temp0, %%mm2 \n\t" 

332 
"xorl %%ecx, %%ecx \n\t" 

333 
"call funnyYCode \n\t" 

334 
"movq temp0, %%mm2 \n\t" 

335 
"xorl %%ecx, %%ecx \n\t" 

336 
"call funnyYCode \n\t" 

337 
"movq temp0, %%mm2 \n\t" 

338 
"xorl %%ecx, %%ecx \n\t" 

339 
"call funnyYCode \n\t" 

340 
"movq temp0, %%mm2 \n\t" 

1133 
#ifdef HAVE_MMX2 

1134 
#define FUNNY_Y_CODE \ 

1135 
"prefetchnta 1024(%%esi) \n\t"\ 

1136 
"prefetchnta 1056(%%esi) \n\t"\ 

1137 
"prefetchnta 1088(%%esi) \n\t"\ 

1138 
"call funnyYCode \n\t"\ 

1139 
"movq temp0, %%mm2 \n\t"\ 

341  1140 
"xorl %%ecx, %%ecx \n\t" 
342 
"call funnyYCode \n\t" 

343 
"movq temp0, %%mm2 \n\t" 

1141 
#else 

1142 
#define FUNNY_Y_CODE \ 

1143 
"call funnyYCode \n\t"\ 

1144 
"movq temp0, %%mm2 \n\t"\ 

344  1145 
"xorl %%ecx, %%ecx \n\t" 
345 
"call funnyYCode \n\t" 

1146 
#endif 

1147 
FUNNY_Y_CODE 

1148 
FUNNY_Y_CODE 

1149 
FUNNY_Y_CODE 

1150 
FUNNY_Y_CODE 

1151 
FUNNY_Y_CODE 

1152 
FUNNY_Y_CODE 

1153 
FUNNY_Y_CODE 

1154 
FUNNY_Y_CODE 

1155  
346  1156 
:: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16), 
347  1157 
"m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF) 
348  1158 
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" 
...  ...  
352  1162 
else 
353  1163 
{ 
354  1164 
#endif 
355 
//NO MMX just normal asm ... FIXME try/write funny MMX2 variant 

356 
//FIXME add prefetch 

1165 
//NO MMX just normal asm ... 

357  1166 
asm volatile( 
358  1167 
"xorl %%eax, %%eax \n\t" // i 
359  1168 
"xorl %%ebx, %%ebx \n\t" // xx 
...  ...  
438  1247 
"xorl %%ebx, %%ebx \n\t" 
439  1248 
"movw %4, %%bx \n\t" // (s_xinc*4)&0xFFFF 
440  1249  
441 
// "int $3\n\t"


1250 
#ifdef HAVE_MMX2


442  1251 
#define FUNNYUVCODE \ 
443 
"call funnyUVCode \n\t"\ 

444 
"movq temp0, %%mm2 \n\t"\ 

445 
"xorl %%ecx, %%ecx \n\t" 

1252 
"prefetchnta 1024(%%esi) \n\t"\ 

1253 
"prefetchnta 1056(%%esi) \n\t"\ 

1254 
"prefetchnta 1088(%%esi) \n\t"\ 

1255 
"call funnyUVCode \n\t"\ 

1256 
"movq temp0, %%mm2 \n\t"\ 

1257 
"xorl %%ecx, %%ecx \n\t" 

1258 
#else 

1259 
#define FUNNYUVCODE \ 

1260 
"call funnyUVCode \n\t"\ 

1261 
"movq temp0, %%mm2 \n\t"\ 

1262 
"xorl %%ecx, %%ecx \n\t" 

1263 
#endif 

446  1264  
447  1265 
FUNNYUVCODE 
448  1266 
FUNNYUVCODE 
...  ...  
455  1273 
FUNNYUVCODE 
456  1274  
457  1275  
458  
459  1276 
"xorl %%eax, %%eax \n\t" // i 
460  1277 
"movl %6, %%esi \n\t" // src 
461  1278 
"movl %1, %%edi \n\t" // buf1 
...  ...  
471  1288 
FUNNYUVCODE 
472  1289 
FUNNYUVCODE 
473  1290  
474 
:: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16), 

1291 
:: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" ((s_xinc2*4)>>16),


475  1292 
"m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2) 
476  1293 
: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" 
477  1294 
); 
478 
for(i=dstw1; (i*s_xinc2)>>16 >=srcWidth/21; i) 

1295 
for(i=dstUVw1; (i*s_xinc2)>>16 >=srcWidth/21; i)


479  1296 
{ 
480  1297 
uvbuf1[i] = src1[srcWidth/21]*128; 
481  1298 
uvbuf1[i+2048] = src2[srcWidth/21]*128; 
...  ...  
516  1333 
"cmpl %2, %%eax \n\t" 
517  1334 
" jb 1b \n\t" 
518  1335  
519  
520 
:: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF), 

1336 
:: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF), 

521  1337 
"r" (src2) 
522  1338 
: "%eax", "%ebx", "%ecx", "%edi", "%esi" 
523  1339 
); 
...  ...  
525  1341 
} //if MMX2 cant be used 
526  1342 
#endif 
527  1343 
#else 
528 
for(i=0;i<dstw;i++){ 

1344 
for(i=0;i<dstUVw;i++){


529  1345 
register unsigned int xx=xpos>>16; 
530  1346 
register unsigned int xalpha=(xpos&0xFFFF)>>9; 
531  1347 
uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha); 
...  ...  
541  1357 
} 
542  1358 
} 
543  1359  
1360 
if(ABS(s_yinc  0x10000) < 10) 

1361 
yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp); 

1362 
else 

1363 
yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp); 

544  1364  
545 
// Note1: this code can be resticted to n*8 (or n*16) width lines to simplify optimization... 

546 
// Re: Note1: ok n*4 for now 

547 
// Note2: instead of using lookup tabs, mmx version could do the multiply... 

548 
// Re: Note2: yep 

549 
// Note3: maybe we should make separated 15/16, 24 and 32bpp version of this: 

550 
// Re: done (32 & 16) and 16 has dithering :) but 16 is untested 

551  1365 
#ifdef HAVE_MMX 
552 
//FIXME write lq version with less uv ... 

553 
//FIXME reorder / optimize 

554 
if(dstbpp == 32) 

555 
{ 

556 
asm volatile( 

557  
558 
#define YSCALEYUV2RGB \ 

559 
"pxor %%mm7, %%mm7 \n\t"\ 

560 
"movd %6, %%mm6 \n\t" /*yalpha1*/\ 

561 
"punpcklwd %%mm6, %%mm6 \n\t"\ 

562 
"punpcklwd %%mm6, %%mm6 \n\t"\ 

563 
"movd %7, %%mm5 \n\t" /*uvalpha1*/\ 

564 
"punpcklwd %%mm5, %%mm5 \n\t"\ 

565 
"punpcklwd %%mm5, %%mm5 \n\t"\ 

566 
"xorl %%eax, %%eax \n\t"\ 

567 
"1: \n\t"\ 

568 
"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\ 

569 
"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\ 

570 
"movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\ 

571 
"movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\ 

572 
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax]  buf1[eax]*/\ 

573 
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax]  uvbuf1[eax]*/\ 

574 
"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax]  buf1[eax])yalpha1>>16*/\ 

575 
"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax]  uvbuf1[eax])uvalpha1>>16*/\ 

576 
"psraw $7, %%mm1 \n\t" /* buf0[eax]  buf1[eax] >>7*/\ 

577 
"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\ 

578 
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax]  uvbuf1[eax] >>7*/\ 

579 
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1yalpha1) >>16*/\ 

580 
"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\ 

581 
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1  uvbuf1[eax](1uvalpha1)*/\ 

582 
"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048]  uvbuf1[eax+2048]*/\ 

583 
"psubw w10, %%mm1 \n\t" /* Y16*/\ 

584 
"psubw w80, %%mm3 \n\t" /* (U128)*/\ 

585 
"psllw $3, %%mm1 \n\t" /* (y16)*8*/\ 

586 
"psllw $3, %%mm3 \n\t" /*(U128)8*/\ 

587 
"pmulhw yCoeff, %%mm1 \n\t"\ 

588 
\ 

589 
\ 

590 
"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048]  uvbuf1[eax+2048])uvalpha1>>16*/\ 

591 
"movq %%mm3, %%mm2 \n\t" /* (U128)8*/\ 

592 
"pmulhw ubCoeff, %%mm3 \n\t"\ 

593 
"psraw $7, %%mm0 \n\t" /* uvbuf0[eax+2048]  uvbuf1[eax+2048] >>7*/\ 

594 
"pmulhw ugCoeff, %%mm2 \n\t"\ 

595 
"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1  uvbuf1[eax+2048](1uvalpha1)*/\ 

596 
"psubw w80, %%mm0 \n\t" /* (V128)*/\ 

597 
"psllw $3, %%mm0 \n\t" /* (V128)8*/\ 

598 
\ 

599 
\ 

600 
"movq %%mm0, %%mm4 \n\t" /* (V128)8*/\ 

601 
"pmulhw vrCoeff, %%mm0 \n\t"\ 

602 
"pmulhw vgCoeff, %%mm4 \n\t"\ 

603 
"paddw %%mm1, %%mm3 \n\t" /* B*/\ 

604 
"paddw %%mm1, %%mm0 \n\t" /* R*/\ 

605 
"packuswb %%mm3, %%mm3 \n\t"\ 

606 
\ 

607 
"packuswb %%mm0, %%mm0 \n\t"\ 

608 
"paddw %%mm4, %%mm2 \n\t"\ 

609 
"paddw %%mm2, %%mm1 \n\t" /* G*/\ 

610 
\ 

611 
"packuswb %%mm1, %%mm1 \n\t" 

612  
613 
YSCALEYUV2RGB 

614 
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG 

615 
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 

616  
617 
"movq %%mm3, %%mm1 \n\t" 

618 
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 

619 
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 

620 
#ifdef HAVE_MMX2 

621 
"movntq %%mm3, (%4, %%eax, 4) \n\t" 

622 
"movntq %%mm1, 8(%4, %%eax, 4) \n\t" 

623 
#else 

624 
"movq %%mm3, (%4, %%eax, 4) \n\t" 

625 
"movq %%mm1, 8(%4, %%eax, 4) \n\t" 

626 
#endif 

627 
"addl $4, %%eax \n\t" 

628 
"cmpl %5, %%eax \n\t" 

629 
" jb 1b \n\t" 

630  
631  
632 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

633 
"m" (yalpha1), "m" (uvalpha1) 

634 
: "%eax" 

635 
); 

636 
} 

637 
else if(dstbpp==24) 

638 
{ 

639 
asm volatile( 

640  
641 
YSCALEYUV2RGB 

642  
643 
// lsb ... msb 

644 
"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG 

645 
"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0 

646  
647 
"movq %%mm3, %%mm1 \n\t" 

648 
"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0 

649 
"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0 

650  
651 
"movq %%mm3, %%mm2 \n\t" // BGR0BGR0 

652 
"psrlq $8, %%mm3 \n\t" // GR0BGR00 

653 
"pand bm00000111, %%mm2 \n\t" // BGR00000 

654 
"pand bm11111000, %%mm3 \n\t" // 000BGR00 

655 
"por %%mm2, %%mm3 \n\t" // BGRBGR00 

656 
"movq %%mm1, %%mm2 \n\t" 

657 
"psllq $48, %%mm1 \n\t" // 000000BG 

658 
"por %%mm1, %%mm3 \n\t" // BGRBGRBG 

659  
660 
"movq %%mm2, %%mm1 \n\t" // BGR0BGR0 

661 
"psrld $16, %%mm2 \n\t" // R000R000 

662 
"psrlq $24, %%mm1 \n\t" // 0BGR0000 

663 
"por %%mm2, %%mm1 \n\t" // RBGRR000 

664  
665 
"movl %4, %%ebx \n\t" 

666 
"addl %%eax, %%ebx \n\t" 

667 
#ifdef HAVE_MMX2 

668 
//FIXME Alignment 

669 
"movntq %%mm3, (%%ebx, %%eax, 2)\n\t" 

670 
"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t" 

671 
#else 

672 
"movd %%mm3, (%%ebx, %%eax, 2) \n\t" 

673 
"psrlq $32, %%mm3 \n\t" 

674 
"movd %%mm3, 4(%%ebx, %%eax, 2) \n\t" 

675 
"movd %%mm1, 8(%%ebx, %%eax, 2) \n\t" 

676 
#endif 

677 
"addl $4, %%eax \n\t" 

678 
"cmpl %5, %%eax \n\t" 

679 
" jb 1b \n\t" 

680  
681 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw), 

682 
"m" (yalpha1), "m" (uvalpha1) 

683 
: "%eax", "%ebx" 

684 
); 

685 
} 

686 
else if(dstbpp==16) 

687 
{ 

688 
asm volatile( 

689  
690 
YSCALEYUV2RGB 

691 
#ifdef DITHER16BPP 

692 
"paddusb g16Dither, %%mm1 \n\t" 

693 
"paddusb b16Dither, %%mm0 \n\t" 

694 
"paddusb b16Dither, %%mm3 \n\t" 

695 
#endif 

696 
"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G 

697 
"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B 

698 
"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R 

699  
700 
"psrlw $3, %%mm3 \n\t" 

701 
"psllw $3, %%mm1 \n\t" 

702 
"psllw $8, %%mm0 \n\t" 

703 
"pand g16Mask, %%mm1 \n\t" 

704 
"pand r16Mask, %%mm0 \n\t" 

705  
706 
"por %%mm3, %%mm1 \n\t" 

707 
"por %%mm1, %%mm0 \n\t" 

708 
#ifdef HAVE_MMX2 

709 
"movntq %%mm0, (%4, %%eax, 2) \n\t" 

710 
#else 

711 
"movq %%mm0, (%4, %%eax, 2) \n\t" 

712 
#endif 

713 
"addl $4, %%eax \n\t" 

714 
"cmpl %5, %%eax \n\t" 

715 
" jb 1b \n\t" 

716  
717 
:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw), 

718 
"m" (yalpha1), "m" (uvalpha1) 

719 
: "%eax" 

720 
); 

721 
} 

722 
#else 

723 
if(dstbpp==32  dstbpp==24) 

724 
{ 

725 
for(i=0;i<dstw;i++){ 

726 
// vertical linear interpolation && yuv2rgb in a single step: 

727 
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)]; 

728 
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16); 

729 
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16); 

730 
dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)]; 

731 
dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]; 

732 
dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)]; 

733 
dest+=dstbpp>>3; 

734 
} 

735 
} 

736 
else if(dstbpp==16) 

737 
{ 

738 
for(i=0;i<dstw;i++){ 

739 
// vertical linear interpolation && yuv2rgb in a single step: 

740 
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)]; 

741 
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16); 

742 
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16); 

743  
744 
((uint16_t*)dest)[0] = 

745 
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3)  

746 
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0  

747 
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800; 

748 
dest+=2; 

749 
} 

750 
} 

751 
else if(dstbpp==15) //15bit FIXME how do i figure out if its 15 or 16? 

752 
{ 

753 
for(i=0;i<dstw;i++){ 

754 
// vertical linear interpolation && yuv2rgb in a single step: 

755 
int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)]; 

756 
int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16); 

757 
int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16); 

758  
759 
((uint16_t*)dest)[0] = 

760 
(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3)  

761 
(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0  

762 
(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00; 

763 
dest+=2; 

764 
} 

765 
} 

766 
#endif 

767  
768 
b16Dither= b16Dither1; 

1366 
b16Dither= b16Dither1; 

769  1367 
b16Dither1= b16Dither2; 
770  1368 
b16Dither2= b16Dither; 
771  1369  
772  1370 
g16Dither= g16Dither1; 
773  1371 
g16Dither1= g16Dither2; 
774  1372 
g16Dither2= g16Dither; 
1373 
#endif 

775  1374 
} 
776  1375  
777  1376 
#ifdef HAVE_3DNOW 
Also available in: Unified diff