Revision d604bab9


postproc/swscale.c
3 3

  
4 4
// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
5 5
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
6
// the parts written by michael are under GNU GPL
6 7

  
7 8
#include <inttypes.h>
8 9
#include "../config.h"
10
#include "swscale.h"
9 11

  
10 12
//#undef HAVE_MMX2
11 13
//#undef HAVE_MMX
12 14
//#undef ARCH_X86
13
#define DITHER16BPP
14
//#define ALT_ERROR
15
#define DITHER1XBPP
16
int fullUVIpol=0;
17
//disables the unscaled height version
18
int allwaysIpol=0;
15 19

  
16 20
#define RET 0xC3 //near return opcode
17 21
/*
18 22
NOTES
19 23

  
20
known BUGS with known cause (no bugreports please!)
21
code reads 1 sample too much (might cause a sig11)
24
known BUGS with known cause (no bugreports please!, but patches are welcome :) )
25
horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)
26

  
27
Supported output formats BGR15 BGR16 BGR24 BGR32 (15,24 are untested)
28
BGR15 & BGR16 MMX versions support dithering
29
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
22 30

  
23 31
TODO
24
check alignment off everything
32
more intelligent misalignment avoidance for the horizontal scaler
25 33
*/
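
Note on the output formats listed above: BGR16 packs blue into the low 5 bits, green into the middle 6 and red into the top 5 of each 16-bit word, while BGR15 keeps 5 bits per channel; this matches the b16/g16/r16 and b15/g15/r15 mask constants defined further down. A minimal C sketch of the packing (illustration only, not part of the revision):

	/* BGR16: rrrrrggg gggbbbbb    BGR15: 0rrrrrgg gggbbbbb  (one 16-bit word per pixel) */
	#define PACK_BGR16(r,g,b) ((uint16_t)((((r)>>3)<<11) | (((g)>>2)<<5) | ((b)>>3)))
	#define PACK_BGR15(r,g,b) ((uint16_t)((((r)>>3)<<10) | (((g)>>3)<<5) | ((b)>>3)))
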
26 34

  
27
static uint64_t yCoeff=    0x2568256825682568LL;
28
static uint64_t ubCoeff=   0x3343334333433343LL;
29
static uint64_t vrCoeff=   0x40cf40cf40cf40cfLL;
30
static uint64_t ugCoeff=   0xE5E2E5E2E5E2E5E2LL;
31
static uint64_t vgCoeff=   0xF36EF36EF36EF36ELL;
32
static uint64_t w80=       0x0080008000800080LL;
33
static uint64_t w10=       0x0010001000100010LL;
34
static uint64_t bm00000111=0x0000000000FFFFFFLL;
35
static uint64_t bm11111000=0xFFFFFFFFFF000000LL;
36

  
37
static uint64_t b16Dither= 0x0004000400040004LL;
38
static uint64_t b16Dither1=0x0004000400040004LL;
39
static uint64_t b16Dither2=0x0602060206020602LL;
40
static uint64_t g16Dither= 0x0002000200020002LL;
41
static uint64_t g16Dither1=0x0002000200020002LL;
42
static uint64_t g16Dither2=0x0301030103010301LL;
43

  
44
static uint64_t b16Mask=   0x001F001F001F001FLL;
45
static uint64_t g16Mask=   0x07E007E007E007E0LL;
46
static uint64_t r16Mask=   0xF800F800F800F800LL;
47
static uint64_t temp0;
35
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
36

  
37
#ifdef HAVE_MMX2
38
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
39
#elif defined (HAVE_3DNOW)
40
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
41
#endif
48 42

  
43
#ifdef HAVE_MMX2
44
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
45
#else
46
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
47
#endif
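
The PAVGB and MOVNTQ wrappers splice their operands into the instruction text with the preprocessor's stringizing operator: PAVGB maps to pavgb (MMX2) or pavgusb (3DNow!), and MOVNTQ maps to a non-temporal movntq on MMX2 or a plain movq otherwise. A hedged usage sketch (store8 is a hypothetical helper, only meant to show the expansion):

	/* MOVNTQ(%%mm0, (%0)) expands to the template string "movntq %%mm0, (%0) \n\t"
	   when HAVE_MMX2 is defined -- a non-temporal store that keeps write-only RGB
	   output out of the cache -- and to an ordinary movq otherwise. */
	static inline void store8(uint8_t *dst)
	{
		asm volatile(MOVNTQ(%%mm0, (%0)) :: "r" (dst) : "memory");
	}
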
48

  
49

  
50
#ifdef HAVE_MMX
51
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
52
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x3343334333433343LL;
53
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x40cf40cf40cf40cfLL;
54
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xE5E2E5E2E5E2E5E2LL;
55
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xF36EF36EF36EF36ELL;
56
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
57
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
58
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
59
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
60
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
61
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
62

  
63
static uint64_t __attribute__((aligned(8))) b16Dither= 0x0004000400040004LL;
64
static uint64_t __attribute__((aligned(8))) b16Dither1=0x0004000400040004LL;
65
static uint64_t __attribute__((aligned(8))) b16Dither2=0x0602060206020602LL;
66
static uint64_t __attribute__((aligned(8))) g16Dither= 0x0002000200020002LL;
67
static uint64_t __attribute__((aligned(8))) g16Dither1=0x0002000200020002LL;
68
static uint64_t __attribute__((aligned(8))) g16Dither2=0x0301030103010301LL;
69

  
70
static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
71
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
72
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
73
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
74
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
75
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
76

  
77
static uint64_t __attribute__((aligned(8))) temp0;
78
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
79
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
80
#endif
49 81

  
50 82
// temporary storage for 4 yuv lines:
51 83
// 16bit for now (mmx likes it more compact)
84
#ifdef HAVE_MMX
85
static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
86
static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
87
#else
52 88
static uint16_t pix_buf_y[4][2048];
53 89
static uint16_t pix_buf_uv[2][2048*2];
90
#endif
54 91

  
55 92
// clipping helper table for C implementations:
56 93
static unsigned char clip_table[768];
......
66 103
static uint8_t funnyYCode[10000];
67 104
static uint8_t funnyUVCode[10000];
68 105

  
106
#define FULL_YSCALEYUV2RGB \
107
		"pxor %%mm7, %%mm7		\n\t"\
108
		"movd %6, %%mm6			\n\t" /*yalpha1*/\
109
		"punpcklwd %%mm6, %%mm6		\n\t"\
110
		"punpcklwd %%mm6, %%mm6		\n\t"\
111
		"movd %7, %%mm5			\n\t" /*uvalpha1*/\
112
		"punpcklwd %%mm5, %%mm5		\n\t"\
113
		"punpcklwd %%mm5, %%mm5		\n\t"\
114
		"xorl %%eax, %%eax		\n\t"\
115
		"1:				\n\t"\
116
		"movq (%0, %%eax, 2), %%mm0	\n\t" /*buf0[eax]*/\
117
		"movq (%1, %%eax, 2), %%mm1	\n\t" /*buf1[eax]*/\
118
		"movq (%2, %%eax,2), %%mm2	\n\t" /* uvbuf0[eax]*/\
119
		"movq (%3, %%eax,2), %%mm3	\n\t" /* uvbuf1[eax]*/\
120
		"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
121
		"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
122
		"pmulhw %%mm6, %%mm0		\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
123
		"pmulhw %%mm5, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
124
		"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
125
		"movq 4096(%2, %%eax,2), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
126
		"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
127
		"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
128
		"movq 4096(%3, %%eax,2), %%mm0	\n\t" /* uvbuf1[eax+2048]*/\
129
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
130
		"psubw %%mm0, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
131
		"psubw w80, %%mm1		\n\t" /* 8(Y-16)*/\
132
		"psubw w400, %%mm3		\n\t" /* 8(U-128)*/\
133
		"pmulhw yCoeff, %%mm1		\n\t"\
134
\
135
\
136
		"pmulhw %%mm5, %%mm4		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
137
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
138
		"pmulhw ubCoeff, %%mm3		\n\t"\
139
		"psraw $4, %%mm0		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
140
		"pmulhw ugCoeff, %%mm2		\n\t"\
141
		"paddw %%mm4, %%mm0		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
142
		"psubw w400, %%mm0		\n\t" /* (V-128)8*/\
143
\
144
\
145
		"movq %%mm0, %%mm4		\n\t" /* (V-128)8*/\
146
		"pmulhw vrCoeff, %%mm0		\n\t"\
147
		"pmulhw vgCoeff, %%mm4		\n\t"\
148
		"paddw %%mm1, %%mm3		\n\t" /* B*/\
149
		"paddw %%mm1, %%mm0		\n\t" /* R*/\
150
		"packuswb %%mm3, %%mm3		\n\t"\
151
\
152
		"packuswb %%mm0, %%mm0		\n\t"\
153
		"paddw %%mm4, %%mm2		\n\t"\
154
		"paddw %%mm2, %%mm1		\n\t" /* G*/\
155
\
156
		"packuswb %%mm1, %%mm1		\n\t"
157

  
158
#define YSCALEYUV2RGB \
159
		"movd %6, %%mm6			\n\t" /*yalpha1*/\
160
		"punpcklwd %%mm6, %%mm6		\n\t"\
161
		"punpcklwd %%mm6, %%mm6		\n\t"\
162
		"movq %%mm6, asm_yalpha1	\n\t"\
163
		"movd %7, %%mm5			\n\t" /*uvalpha1*/\
164
		"punpcklwd %%mm5, %%mm5		\n\t"\
165
		"punpcklwd %%mm5, %%mm5		\n\t"\
166
		"movq %%mm5, asm_uvalpha1	\n\t"\
167
		"xorl %%eax, %%eax		\n\t"\
168
		"1:				\n\t"\
169
		"movq (%2, %%eax), %%mm2	\n\t" /* uvbuf0[eax]*/\
170
		"movq (%3, %%eax), %%mm3	\n\t" /* uvbuf1[eax]*/\
171
		"movq 4096(%2, %%eax), %%mm5	\n\t" /* uvbuf0[eax+2048]*/\
172
		"movq 4096(%3, %%eax), %%mm4	\n\t" /* uvbuf1[eax+2048]*/\
173
		"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
174
		"psubw %%mm4, %%mm5		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
175
		"movq asm_uvalpha1, %%mm0	\n\t"\
176
		"pmulhw %%mm0, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
177
		"pmulhw %%mm0, %%mm5		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
178
		"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
179
		"psraw $4, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
180
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
181
		"paddw %%mm5, %%mm4		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
182
		"psubw w400, %%mm3		\n\t" /* (U-128)8*/\
183
		"psubw w400, %%mm4		\n\t" /* (V-128)8*/\
184
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
185
		"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
186
		"pmulhw ugCoeff, %%mm3		\n\t"\
187
		"pmulhw vgCoeff, %%mm4		\n\t"\
188
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
189
		"movq (%0, %%eax, 2), %%mm0	\n\t" /*buf0[eax]*/\
190
		"movq (%1, %%eax, 2), %%mm1	\n\t" /*buf1[eax]*/\
191
		"movq 8(%0, %%eax, 2), %%mm6	\n\t" /*buf0[eax]*/\
192
		"movq 8(%1, %%eax, 2), %%mm7	\n\t" /*buf1[eax]*/\
193
		"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
194
		"psubw %%mm7, %%mm6		\n\t" /* buf0[eax] - buf1[eax]*/\
195
		"pmulhw asm_yalpha1, %%mm0	\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
196
		"pmulhw asm_yalpha1, %%mm6	\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
197
		"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
198
		"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
199
		"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
200
		"paddw %%mm6, %%mm7		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
201
		"pmulhw ubCoeff, %%mm2		\n\t"\
202
		"pmulhw vrCoeff, %%mm5		\n\t"\
203
		"psubw w80, %%mm1		\n\t" /* 8(Y-16)*/\
204
		"psubw w80, %%mm7		\n\t" /* 8(Y-16)*/\
205
		"pmulhw yCoeff, %%mm1		\n\t"\
206
		"pmulhw yCoeff, %%mm7		\n\t"\
207
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
208
		"paddw %%mm3, %%mm4		\n\t"\
209
		"movq %%mm2, %%mm0		\n\t"\
210
		"movq %%mm5, %%mm6		\n\t"\
211
		"movq %%mm4, %%mm3		\n\t"\
212
		"punpcklwd %%mm2, %%mm2		\n\t"\
213
		"punpcklwd %%mm5, %%mm5		\n\t"\
214
		"punpcklwd %%mm4, %%mm4		\n\t"\
215
		"paddw %%mm1, %%mm2		\n\t"\
216
		"paddw %%mm1, %%mm5		\n\t"\
217
		"paddw %%mm1, %%mm4		\n\t"\
218
		"punpckhwd %%mm0, %%mm0		\n\t"\
219
		"punpckhwd %%mm6, %%mm6		\n\t"\
220
		"punpckhwd %%mm3, %%mm3		\n\t"\
221
		"paddw %%mm7, %%mm0		\n\t"\
222
		"paddw %%mm7, %%mm6		\n\t"\
223
		"paddw %%mm7, %%mm3		\n\t"\
224
		/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
225
		"packuswb %%mm0, %%mm2		\n\t"\
226
		"packuswb %%mm6, %%mm5		\n\t"\
227
		"packuswb %%mm3, %%mm4		\n\t"\
228
		"pxor %%mm7, %%mm7		\n\t"
229

  
230
#define YSCALEYUV2RGB1 \
231
		"xorl %%eax, %%eax		\n\t"\
232
		"1:				\n\t"\
233
		"movq (%2, %%eax), %%mm3	\n\t" /* uvbuf0[eax]*/\
234
		"movq 4096(%2, %%eax), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
235
		"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
236
		"psraw $4, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
237
		"psubw w400, %%mm3		\n\t" /* (U-128)8*/\
238
		"psubw w400, %%mm4		\n\t" /* (V-128)8*/\
239
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
240
		"movq %%mm4, %%mm5		\n\t" /* (V-128)8*/\
241
		"pmulhw ugCoeff, %%mm3		\n\t"\
242
		"pmulhw vgCoeff, %%mm4		\n\t"\
243
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
244
		"movq (%1, %%eax, 2), %%mm1	\n\t" /*buf0[eax]*/\
245
		"movq 8(%1, %%eax, 2), %%mm7	\n\t" /*buf0[eax]*/\
246
		"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
247
		"psraw $4, %%mm7		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
248
		"pmulhw ubCoeff, %%mm2		\n\t"\
249
		"pmulhw vrCoeff, %%mm5		\n\t"\
250
		"psubw w80, %%mm1		\n\t" /* 8(Y-16)*/\
251
		"psubw w80, %%mm7		\n\t" /* 8(Y-16)*/\
252
		"pmulhw yCoeff, %%mm1		\n\t"\
253
		"pmulhw yCoeff, %%mm7		\n\t"\
254
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
255
		"paddw %%mm3, %%mm4		\n\t"\
256
		"movq %%mm2, %%mm0		\n\t"\
257
		"movq %%mm5, %%mm6		\n\t"\
258
		"movq %%mm4, %%mm3		\n\t"\
259
		"punpcklwd %%mm2, %%mm2		\n\t"\
260
		"punpcklwd %%mm5, %%mm5		\n\t"\
261
		"punpcklwd %%mm4, %%mm4		\n\t"\
262
		"paddw %%mm1, %%mm2		\n\t"\
263
		"paddw %%mm1, %%mm5		\n\t"\
264
		"paddw %%mm1, %%mm4		\n\t"\
265
		"punpckhwd %%mm0, %%mm0		\n\t"\
266
		"punpckhwd %%mm6, %%mm6		\n\t"\
267
		"punpckhwd %%mm3, %%mm3		\n\t"\
268
		"paddw %%mm7, %%mm0		\n\t"\
269
		"paddw %%mm7, %%mm6		\n\t"\
270
		"paddw %%mm7, %%mm3		\n\t"\
271
		/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
272
		"packuswb %%mm0, %%mm2		\n\t"\
273
		"packuswb %%mm6, %%mm5		\n\t"\
274
		"packuswb %%mm3, %%mm4		\n\t"\
275
		"pxor %%mm7, %%mm7		\n\t"
276

  
277
#define WRITEBGR32 \
278
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
279
			"movq %%mm2, %%mm1		\n\t" /* B */\
280
			"movq %%mm5, %%mm6		\n\t" /* R */\
281
			"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
282
			"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
283
			"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
284
			"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
285
			"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
286
			"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
287
			"punpcklwd %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
288
			"punpckhwd %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
289
			"punpcklwd %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
290
			"punpckhwd %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
291
\
292
			MOVNTQ(%%mm0, (%4, %%eax, 4))\
293
			MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
294
			MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
295
			MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
296
\
297
			"addl $8, %%eax			\n\t"\
298
			"cmpl %5, %%eax			\n\t"\
299
			" jb 1b				\n\t"
300

  
301
#define WRITEBGR16 \
302
			"movq %%mm2, %%mm1		\n\t" /* B */\
303
			"movq %%mm4, %%mm3		\n\t" /* G */\
304
			"movq %%mm5, %%mm6		\n\t" /* R */\
305
\
306
			"punpcklbw %%mm7, %%mm3		\n\t" /* 0G0G0G0G */\
307
			"punpcklbw %%mm7, %%mm2		\n\t" /* 0B0B0B0B */\
308
			"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R */\
309
\
310
			"psrlw $3, %%mm2		\n\t"\
311
			"psllw $3, %%mm3		\n\t"\
312
			"psllw $8, %%mm5		\n\t"\
313
\
314
			"pand g16Mask, %%mm3		\n\t"\
315
			"pand r16Mask, %%mm5		\n\t"\
316
\
317
			"por %%mm3, %%mm2		\n\t"\
318
			"por %%mm5, %%mm2		\n\t"\
319
\
320
			"punpckhbw %%mm7, %%mm4		\n\t" /* 0G0G0G0G */\
321
			"punpckhbw %%mm7, %%mm1		\n\t" /* 0B0B0B0B */\
322
			"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R */\
323
\
324
			"psrlw $3, %%mm1		\n\t"\
325
			"psllw $3, %%mm4		\n\t"\
326
			"psllw $8, %%mm6		\n\t"\
327
\
328
			"pand g16Mask, %%mm4		\n\t"\
329
			"pand r16Mask, %%mm6		\n\t"\
330
\
331
			"por %%mm4, %%mm1		\n\t"\
332
			"por %%mm6, %%mm1		\n\t"\
333
\
334
			MOVNTQ(%%mm2, (%4, %%eax, 2))\
335
			MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
336
\
337
			"addl $8, %%eax			\n\t"\
338
			"cmpl %5, %%eax			\n\t"\
339
			" jb 1b				\n\t"
340

  
341
#define WRITEBGR15 \
342
			"movq %%mm2, %%mm1		\n\t" /* B */\
343
			"movq %%mm4, %%mm3		\n\t" /* G */\
344
			"movq %%mm5, %%mm6		\n\t" /* R */\
345
\
346
			"punpcklbw %%mm7, %%mm3		\n\t" /* 0G0G0G0G */\
347
			"punpcklbw %%mm7, %%mm2		\n\t" /* 0B0B0B0B */\
348
			"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R */\
349
\
350
			"psrlw $3, %%mm2		\n\t"\
351
			"psllw $2, %%mm3		\n\t"\
352
			"psllw $7, %%mm5		\n\t"\
353
\
354
			"pand g15Mask, %%mm3		\n\t"\
355
			"pand r15Mask, %%mm5		\n\t"\
356
\
357
			"por %%mm3, %%mm2		\n\t"\
358
			"por %%mm5, %%mm2		\n\t"\
359
\
360
			"punpckhbw %%mm7, %%mm4		\n\t" /* 0G0G0G0G */\
361
			"punpckhbw %%mm7, %%mm1		\n\t" /* 0B0B0B0B */\
362
			"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R */\
363
\
364
			"psrlw $3, %%mm1		\n\t"\
365
			"psllw $2, %%mm4		\n\t"\
366
			"psllw $7, %%mm6		\n\t"\
367
\
368
			"pand g15Mask, %%mm4		\n\t"\
369
			"pand r15Mask, %%mm6		\n\t"\
370
\
371
			"por %%mm4, %%mm1		\n\t"\
372
			"por %%mm6, %%mm1		\n\t"\
373
\
374
			MOVNTQ(%%mm2, (%4, %%eax, 2))\
375
			MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
376
\
377
			"addl $8, %%eax			\n\t"\
378
			"cmpl %5, %%eax			\n\t"\
379
			" jb 1b				\n\t"
380
// FIXME find a faster way to shuffle it to BGR24
381
#define WRITEBGR24 \
382
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
383
			"movq %%mm2, %%mm1		\n\t" /* B */\
384
			"movq %%mm5, %%mm6		\n\t" /* R */\
385
			"punpcklbw %%mm4, %%mm2		\n\t" /* GBGBGBGB 0 */\
386
			"punpcklbw %%mm7, %%mm5		\n\t" /* 0R0R0R0R 0 */\
387
			"punpckhbw %%mm4, %%mm1		\n\t" /* GBGBGBGB 2 */\
388
			"punpckhbw %%mm7, %%mm6		\n\t" /* 0R0R0R0R 2 */\
389
			"movq %%mm2, %%mm0		\n\t" /* GBGBGBGB 0 */\
390
			"movq %%mm1, %%mm3		\n\t" /* GBGBGBGB 2 */\
391
			"punpcklbw %%mm5, %%mm0		\n\t" /* 0RGB0RGB 0 */\
392
			"punpckhbw %%mm5, %%mm2		\n\t" /* 0RGB0RGB 1 */\
393
			"punpcklbw %%mm6, %%mm1		\n\t" /* 0RGB0RGB 2 */\
394
			"punpckhbw %%mm6, %%mm3		\n\t" /* 0RGB0RGB 3 */\
395
\
396
			"movq %%mm0, %%mm4		\n\t" /* 0RGB0RGB 0 */\
397
			"psrlq $8, %%mm0		\n\t" /* 00RGB0RG 0 */\
398
			"pand bm00000111, %%mm4		\n\t" /* 00000RGB 0 */\
399
			"pand bm11111000, %%mm0		\n\t" /* 00RGB000 0.5 */\
400
			"por %%mm4, %%mm0		\n\t" /* 00RGBRGB 0 */\
401
			"movq %%mm2, %%mm4		\n\t" /* 0RGB0RGB 1 */\
402
			"psllq $48, %%mm2		\n\t" /* GB000000 1 */\
403
			"por %%mm2, %%mm0		\n\t" /* GBRGBRGB 0 */\
404
\
405
			"movq %%mm4, %%mm2		\n\t" /* 0RGB0RGB 1 */\
406
			"psrld $16, %%mm4		\n\t" /* 000R000R 1 */\
407
			"psrlq $24, %%mm2		\n\t" /* 0000RGB0 1.5 */\
408
			"por %%mm4, %%mm2		\n\t" /* 000RRGBR 1 */\
409
			"pand bm00001111, %%mm2		\n\t" /* 0000RGBR 1 */\
410
			"movq %%mm1, %%mm4		\n\t" /* 0RGB0RGB 2 */\
411
			"psrlq $8, %%mm1		\n\t" /* 00RGB0RG 2 */\
412
			"pand bm00000111, %%mm4		\n\t" /* 00000RGB 2 */\
413
			"pand bm11111000, %%mm1		\n\t" /* 00RGB000 2.5 */\
414
			"por %%mm4, %%mm1		\n\t" /* 00RGBRGB 2 */\
415
			"movq %%mm1, %%mm4		\n\t" /* 00RGBRGB 2 */\
416
			"psllq $32, %%mm1		\n\t" /* BRGB0000 2 */\
417
			"por %%mm1, %%mm2		\n\t" /* BRGBRGBR 1 */\
418
\
419
			"psrlq $32, %%mm4		\n\t" /* 000000RG 2.5 */\
420
			"movq %%mm3, %%mm5		\n\t" /* 0RGB0RGB 3 */\
421
			"psrlq $8, %%mm3		\n\t" /* 00RGB0RG 3 */\
422
			"pand bm00000111, %%mm5		\n\t" /* 00000RGB 3 */\
423
			"pand bm11111000, %%mm3		\n\t" /* 00RGB000 3.5 */\
424
			"por %%mm5, %%mm3		\n\t" /* 00RGBRGB 3 */\
425
			"psllq $16, %%mm3		\n\t" /* RGBRGB00 3 */\
426
			"por %%mm4, %%mm3		\n\t" /* RGBRGBRG 2.5 */\
427
\
428
			"leal (%%eax, %%eax, 2), %%ebx	\n\t"\
429
			MOVNTQ(%%mm0, (%4, %%ebx))\
430
			MOVNTQ(%%mm2, 8(%4, %%ebx))\
431
			MOVNTQ(%%mm3, 16(%4, %%ebx))\
432
\
433
			"addl $8, %%eax			\n\t"\
434
			"cmpl %5, %%eax			\n\t"\
435
			" jb 1b				\n\t"
436

  
437

  
438
/**
439
 * vertical scale YV12 to RGB
440
 */
441
static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
442
			    uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
443
{
444
	int yalpha1=yalpha^4095;
445
	int uvalpha1=uvalpha^4095;
446
	int i;
447

  
448
	if(fullUVIpol)
449
	{
450

  
451
#ifdef HAVE_MMX
452
		if(dstbpp == 32)
453
		{
454
			asm volatile(
455

  
456

  
457
FULL_YSCALEYUV2RGB
458
			"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
459
			"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0
460

  
461
			"movq %%mm3, %%mm1		\n\t"
462
			"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
463
			"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0
464

  
465
			MOVNTQ(%%mm3, (%4, %%eax, 4))
466
			MOVNTQ(%%mm1, 8(%4, %%eax, 4))
467

  
468
			"addl $4, %%eax			\n\t"
469
			"cmpl %5, %%eax			\n\t"
470
			" jb 1b				\n\t"
471

  
472

  
473
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
474
			"m" (yalpha1), "m" (uvalpha1)
475
			: "%eax"
476
			);
477
		}
478
		else if(dstbpp==24)
479
		{
480
			asm volatile(
481

  
482
FULL_YSCALEYUV2RGB
483

  
484
								// lsb ... msb
485
			"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
486
			"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0
487

  
488
			"movq %%mm3, %%mm1		\n\t"
489
			"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
490
			"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0
491

  
492
			"movq %%mm3, %%mm2		\n\t" // BGR0BGR0
493
			"psrlq $8, %%mm3		\n\t" // GR0BGR00
494
			"pand bm00000111, %%mm2		\n\t" // BGR00000
495
			"pand bm11111000, %%mm3		\n\t" // 000BGR00
496
			"por %%mm2, %%mm3		\n\t" // BGRBGR00
497
			"movq %%mm1, %%mm2		\n\t"
498
			"psllq $48, %%mm1		\n\t" // 000000BG
499
			"por %%mm1, %%mm3		\n\t" // BGRBGRBG
500

  
501
			"movq %%mm2, %%mm1		\n\t" // BGR0BGR0
502
			"psrld $16, %%mm2		\n\t" // R000R000
503
			"psrlq $24, %%mm1		\n\t" // 0BGR0000
504
			"por %%mm2, %%mm1		\n\t" // RBGRR000
505

  
506
			"movl %4, %%ebx			\n\t"
507
			"addl %%eax, %%ebx		\n\t"
508

  
509
#ifdef HAVE_MMX2
510
			//FIXME Alignment
511
			"movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
512
			"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
513
#else
514
			"movd %%mm3, (%%ebx, %%eax, 2)	\n\t"
515
			"psrlq $32, %%mm3		\n\t"
516
			"movd %%mm3, 4(%%ebx, %%eax, 2)	\n\t"
517
			"movd %%mm1, 8(%%ebx, %%eax, 2)	\n\t"
518
#endif
519
			"addl $4, %%eax			\n\t"
520
			"cmpl %5, %%eax			\n\t"
521
			" jb 1b				\n\t"
522

  
523
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
524
			"m" (yalpha1), "m" (uvalpha1)
525
			: "%eax", "%ebx"
526
			);
527
		}
528
		else if(dstbpp==15)
529
		{
530
			asm volatile(
531

  
532
FULL_YSCALEYUV2RGB
533
#ifdef DITHER1XBPP
534
			"paddusb b16Dither, %%mm1	\n\t"
535
			"paddusb b16Dither, %%mm0	\n\t"
536
			"paddusb b16Dither, %%mm3	\n\t"
537
#endif
538
			"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
539
			"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
540
			"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R
541

  
542
			"psrlw $3, %%mm3		\n\t"
543
			"psllw $2, %%mm1		\n\t"
544
			"psllw $7, %%mm0		\n\t"
545
			"pand g15Mask, %%mm1		\n\t"
546
			"pand r15Mask, %%mm0		\n\t"
547

  
548
			"por %%mm3, %%mm1		\n\t"
549
			"por %%mm1, %%mm0		\n\t"
550

  
551
			MOVNTQ(%%mm0, (%4, %%eax, 2))
552

  
553
			"addl $4, %%eax			\n\t"
554
			"cmpl %5, %%eax			\n\t"
555
			" jb 1b				\n\t"
556

  
557
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
558
			"m" (yalpha1), "m" (uvalpha1)
559
			: "%eax"
560
			);
561
		}
562
		else if(dstbpp==16)
563
		{
564
			asm volatile(
565

  
566
FULL_YSCALEYUV2RGB
567
#ifdef DITHER1XBPP
568
			"paddusb g16Dither, %%mm1	\n\t"
569
			"paddusb b16Dither, %%mm0	\n\t"
570
			"paddusb b16Dither, %%mm3	\n\t"
571
#endif
572
			"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
573
			"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
574
			"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R
575

  
576
			"psrlw $3, %%mm3		\n\t"
577
			"psllw $3, %%mm1		\n\t"
578
			"psllw $8, %%mm0		\n\t"
579
			"pand g16Mask, %%mm1		\n\t"
580
			"pand r16Mask, %%mm0		\n\t"
581

  
582
			"por %%mm3, %%mm1		\n\t"
583
			"por %%mm1, %%mm0		\n\t"
584

  
585
			MOVNTQ(%%mm0, (%4, %%eax, 2))
586

  
587
			"addl $4, %%eax			\n\t"
588
			"cmpl %5, %%eax			\n\t"
589
			" jb 1b				\n\t"
590

  
591
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
592
			"m" (yalpha1), "m" (uvalpha1)
593
			: "%eax"
594
			);
595
		}
596
#else
597
		if(dstbpp==32 || dstbpp==24)
598
		{
599
			for(i=0;i<dstw;i++){
600
				// vertical linear interpolation && yuv2rgb in a single step:
601
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
602
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
603
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
604
				dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
605
				dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
606
				dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
607
				dest+=dstbpp>>3;
608
			}
609
		}
610
		else if(dstbpp==16)
611
		{
612
			for(i=0;i<dstw;i++){
613
				// vertical linear interpolation && yuv2rgb in a single step:
614
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
615
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
616
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
617

  
618
				((uint16_t*)dest)[0] =
619
					(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
620
					(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
621
					(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
622
				dest+=2;
623
			}
624
		}
625
		else if(dstbpp==15)
626
		{
627
			for(i=0;i<dstw;i++){
628
				// vertical linear interpolation && yuv2rgb in a single step:
629
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
630
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
631
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
632

  
633
				((uint16_t*)dest)[0] =
634
					(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
635
					(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
636
					(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
637
				dest+=2;
638
			}
639
		}
640
#endif
641
	}//FULL_UV_IPOL
642
	else
643
	{
644
#ifdef HAVE_MMX
645
		if(dstbpp == 32)
646
		{
647
			asm volatile(
648
				YSCALEYUV2RGB
649
				WRITEBGR32
650

  
651
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
652
			"m" (yalpha1), "m" (uvalpha1)
653
			: "%eax"
654
			);
655
		}
656
		else if(dstbpp==24)
657
		{
658
			asm volatile(
659
				YSCALEYUV2RGB
660
				WRITEBGR24
661

  
662
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
663
			"m" (yalpha1), "m" (uvalpha1)
664
			: "%eax", "%ebx"
665
			);
666
		}
667
		else if(dstbpp==15)
668
		{
669
			asm volatile(
670
				YSCALEYUV2RGB
671
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
672
#ifdef DITHER1XBPP
673
				"paddusb b16Dither, %%mm2	\n\t"
674
				"paddusb b16Dither, %%mm4	\n\t"
675
				"paddusb b16Dither, %%mm5	\n\t"
676
#endif
677

  
678
				WRITEBGR15
679

  
680
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
681
			"m" (yalpha1), "m" (uvalpha1)
682
			: "%eax"
683
			);
684
		}
685
		else if(dstbpp==16)
686
		{
687
			asm volatile(
688
				YSCALEYUV2RGB
689
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
690
#ifdef DITHER1XBPP
691
				"paddusb g16Dither, %%mm2	\n\t"
692
				"paddusb b16Dither, %%mm4	\n\t"
693
				"paddusb b16Dither, %%mm5	\n\t"
694
#endif
695

  
696
				WRITEBGR16
697

  
698
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
699
			"m" (yalpha1), "m" (uvalpha1)
700
			: "%eax"
701
			);
702
		}
703
#else
704
//FIXME unroll C loop and dont recalculate UV
705
		if(dstbpp==32 || dstbpp==24)
706
		{
707
			for(i=0;i<dstw;i++){
708
				// vertical linear interpolation && yuv2rgb in a single step:
709
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
710
				int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
711
				int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
712
				dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
713
				dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
714
				dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
715
				dest+=dstbpp>>3;
716
			}
717
		}
718
		else if(dstbpp==16)
719
		{
720
			for(i=0;i<dstw;i++){
721
				// vertical linear interpolation && yuv2rgb in a single step:
722
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
723
				int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
724
				int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
725

  
726
				((uint16_t*)dest)[0] =
727
					(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
728
					(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
729
					(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
730
				dest+=2;
731
			}
732
		}
733
		else if(dstbpp==15)
734
		{
735
			for(i=0;i<dstw;i++){
736
				// vertical linear interpolation && yuv2rgb in a single step:
737
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
738
				int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
739
				int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
740

  
741
				((uint16_t*)dest)[0] =
742
					(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
743
					(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
744
					(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
745
				dest+=2;
746
			}
747
		}
748
#endif
749
	} //!FULL_UV_IPOL
750
}
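
For orientation, the scaler further down calls this routine once per output line, roughly as follows (buffer setup omitted; see the yuv2rgb1/yuv2rgbX dispatch near the end of the diff):

	/* buf0/buf1:      the two horizontally scaled luma lines bracketing the output line
	   uvbuf0/uvbuf1:  the matching chroma lines (U at [i], V at [i+2048])
	   yalpha/uvalpha: 12-bit vertical blend weights (0..4095)
	   dstbpp:         15, 16, 24 or 32 */
	yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
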
751

  
752
/**
753
 * YV12 to RGB without scaling or interpolating
754
 */
755
static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
756
			    uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
757
{
758
	int yalpha1=yalpha^4095;
759
	int uvalpha1=uvalpha^4095;
760
	int i;
761
	if(fullUVIpol || allwaysIpol)
762
	{
763
		yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
764
		return;
765
	}
766
#ifdef HAVE_MMX
767
		if(dstbpp == 32)
768
		{
769
			asm volatile(
770
				YSCALEYUV2RGB1
771
				WRITEBGR32
772
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
773
			"m" (yalpha1), "m" (uvalpha1)
774
			: "%eax"
775
			);
776
		}
777
		else if(dstbpp==24)
778
		{
779
			asm volatile(
780
				YSCALEYUV2RGB1
781
				WRITEBGR24
782
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
783
			"m" (yalpha1), "m" (uvalpha1)
784
			: "%eax", "%ebx"
785
			);
786
		}
787
		else if(dstbpp==15)
788
		{
789
			asm volatile(
790
				YSCALEYUV2RGB1
791
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
792
#ifdef DITHER1XBPP
793
				"paddusb b16Dither, %%mm2	\n\t"
794
				"paddusb b16Dither, %%mm4	\n\t"
795
				"paddusb b16Dither, %%mm5	\n\t"
796
#endif
797
				WRITEBGR15
798
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
799
			"m" (yalpha1), "m" (uvalpha1)
800
			: "%eax"
801
			);
802
		}
803
		else if(dstbpp==16)
804
		{
805
			asm volatile(
806
				YSCALEYUV2RGB1
807
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
808
#ifdef DITHER1XBPP
809
				"paddusb g16Dither, %%mm2	\n\t"
810
				"paddusb b16Dither, %%mm4	\n\t"
811
				"paddusb b16Dither, %%mm5	\n\t"
812
#endif
813

  
814
				WRITEBGR16
815
			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
816
			"m" (yalpha1), "m" (uvalpha1)
817
			: "%eax"
818
			);
819
		}
820
#else
821
//FIXME unroll C loop and dont recalculate UV
822
		if(dstbpp==32 || dstbpp==24)
823
		{
824
			for(i=0;i<dstw;i++){
825
				// vertical linear interpolation && yuv2rgb in a single step:
826
				int Y=yuvtab_2568[buf0[i]>>7];
827
				int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
828
				int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
829
				dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
830
				dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
831
				dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
832
				dest+=dstbpp>>3;
833
			}
834
		}
835
		else if(dstbpp==16)
836
		{
837
			for(i=0;i<dstw;i++){
838
				// vertical linear interpolation && yuv2rgb in a single step:
839
				int Y=yuvtab_2568[buf0[i]>>7];
840
				int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
841
				int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
842

  
843
				((uint16_t*)dest)[0] =
844
					(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
845
					(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
846
					(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
847
				dest+=2;
848
			}
849
		}
850
		else if(dstbpp==15)
851
		{
852
			for(i=0;i<dstw;i++){
853
				// vertical linear interpolation && yuv2rgb in a single step:
854
				int Y=yuvtab_2568[buf0[i]>>7];
855
				int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
856
				int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
857

  
858
				((uint16_t*)dest)[0] =
859
					(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
860
					(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
861
					(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
862
				dest+=2;
863
			}
864
		}
865
#endif
866
}
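
The unscaled variant skips the vertical blend for luma: the horizontal scaler stores each sample multiplied by a 7-bit weight sum (roughly sample*128, see the xalpha^127 arithmetic and the *128 edge fill further down), so buf0[i]>>7 recovers the plain sample before the table lookup. One-line arithmetic sketch (illustration):

	/* buf0[i] ~= src_sample * 128   =>   buf0[i] >> 7 ~= src_sample */
	int Y = yuvtab_2568[buf0[i] >> 7];
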
867

  
868

  
869

  
69 870

  
70 871
// *** bilinear scaling and yuv->rgb conversion of yv12 slices:
71 872
// *** Note: it's called multiple times while decoding a frame, first time y==0
......
95 896
// used to detect a horizontal size change
96 897
static int old_dstw= -1;
97 898
static int old_s_xinc= -1;
98

  
99 899
#endif
900

  
100 901
int canMMX2BeUsed=0;
101 902
int srcWidth= (dstw*s_xinc + 0x8000)>>16;
903
int dstUVw= fullUVIpol ? dstw : dstw/2;
904

  
102 905

  
103 906
#ifdef HAVE_MMX2
104 907
canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0;
......
111 914
// first and last pixel
112 915
if(canMMX2BeUsed) 	s_xinc+= 20;
113 916
else			s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20;
114
s_xinc2=s_xinc>>1;
115 917

  
918
if(fullUVIpol) 	s_xinc2= s_xinc>>1;
919
else		s_xinc2= s_xinc;
116 920
  // force calculation of the horizontal interpolation of the first line
117 921
  s_last_ypos=-99;
118 922
  s_last_y1pos=-99;
......
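
s_xinc is the horizontal step in 16.16 fixed point (source pixels advanced per destination pixel), which is why srcWidth above is recovered as (dstw*s_xinc + 0x8000)>>16. A small worked example of how the C fallback scaler consumes it (numbers illustrative):

	/* scaling 720 -> 1024 pixels: s_xinc ~= (720<<16)/1024 = 0xB400, i.e. 0.703 src px per dst px */
	unsigned xpos = 0;
	for (i = 0; i < dstw; i++) {
		unsigned xx     = xpos >> 16;            /* integer source index */
		unsigned xalpha = (xpos & 0xFFFF) >> 9;  /* 7-bit fraction for the blend */
		/* buf[i] = src[xx]*(xalpha^127) + src[xx+1]*xalpha   (~sample*128, as in the C scaler) */
		xpos += s_xinc;
	}
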
215 1019
				funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
216 1020
					a | (b<<2) | (c<<4) | (d<<6);
217 1021

  
1022
				// if we dont need to read 8 bytes than dont :), reduces the chance of
1023
				// crossing a cache line
1024
				if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E;
1025

  
218 1026
				funnyYCode[fragmentLength*(i+4)/4]= RET;
219 1027
			}
220 1028
			xpos+=s_xinc;
221 1029
		}
222 1030

  
223 1031
		xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chrom samples
224
		for(i=0; i<dstw/8; i++)
1032
		for(i=0; i<dstUVw/8; i++)
225 1033
		{
226 1034
			int xx=xpos>>16;
227 1035

  
......
238 1046
				funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
239 1047
					a | (b<<2) | (c<<4) | (d<<6);
240 1048

  
1049
				// if we dont need to read 8 bytes than dont :), reduces the chance of
1050
				// crossing a cache line
1051
				if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;
1052

  
241 1053
				funnyUVCode[fragmentLength*(i+4)/4]= RET;
242 1054
			}
243 1055
			xpos+=s_xinc2;
......
255 1067
	// points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src)
256 1068
    int srcuvpos= s_srcypos + s_yinc/2 - 0x8000;
257 1069
    int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
258
    int yalpha=((s_srcypos-1)&0xFFFF)>>7;
259
    int yalpha1=yalpha^511;
260
    int uvalpha=((srcuvpos-1)&0x1FFFF)>>8;
261
    int uvalpha1=uvalpha^511;
1070
    int yalpha=((s_srcypos-1)&0xFFFF)>>4;
1071
    int uvalpha=((srcuvpos-1)&0x1FFFF)>>5;
262 1072
    uint16_t *buf0=pix_buf_y[y0&1];		// top line of the interpolated slice
263 1073
    uint16_t *buf1=pix_buf_y[((y0+1)&1)];	// bottom line of the interpolated slice
264 1074
    uint16_t *uvbuf0=pix_buf_uv[y1&1];		// top line of the interpolated slice
......
320 1130
			"xorl %%ecx, %%ecx		\n\t"
321 1131
			"xorl %%ebx, %%ebx		\n\t"
322 1132
			"movw %4, %%bx			\n\t" // (s_xinc*4)&0xFFFF
323
	//	"int $3\n\t"
324
			"call funnyYCode			\n\t"
325
			"movq temp0, %%mm2		\n\t"
326
			"xorl %%ecx, %%ecx		\n\t"
327
			"call funnyYCode			\n\t"
328
			"movq temp0, %%mm2		\n\t"
329
			"xorl %%ecx, %%ecx		\n\t"
330
			"call funnyYCode			\n\t"
331
			"movq temp0, %%mm2		\n\t"
332
			"xorl %%ecx, %%ecx		\n\t"
333
			"call funnyYCode			\n\t"
334
			"movq temp0, %%mm2		\n\t"
335
			"xorl %%ecx, %%ecx		\n\t"
336
			"call funnyYCode			\n\t"
337
			"movq temp0, %%mm2		\n\t"
338
			"xorl %%ecx, %%ecx		\n\t"
339
			"call funnyYCode			\n\t"
340
			"movq temp0, %%mm2		\n\t"
1133
#ifdef HAVE_MMX2
1134
#define FUNNY_Y_CODE \
1135
			"prefetchnta 1024(%%esi)	\n\t"\
1136
			"prefetchnta 1056(%%esi)	\n\t"\
1137
			"prefetchnta 1088(%%esi)	\n\t"\
1138
			"call funnyYCode		\n\t"\
1139
			"movq temp0, %%mm2		\n\t"\
341 1140
			"xorl %%ecx, %%ecx		\n\t"
342
			"call funnyYCode			\n\t"
343
			"movq temp0, %%mm2		\n\t"
1141
#else
1142
#define FUNNY_Y_CODE \
1143
			"call funnyYCode		\n\t"\
1144
			"movq temp0, %%mm2		\n\t"\
344 1145
			"xorl %%ecx, %%ecx		\n\t"
345
			"call funnyYCode			\n\t"
1146
#endif
1147
FUNNY_Y_CODE
1148
FUNNY_Y_CODE
1149
FUNNY_Y_CODE
1150
FUNNY_Y_CODE
1151
FUNNY_Y_CODE
1152
FUNNY_Y_CODE
1153
FUNNY_Y_CODE
1154
FUNNY_Y_CODE
1155

  
346 1156
			:: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16),
347 1157
			"m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF)
348 1158
			: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
......
352 1162
	else
353 1163
	{
354 1164
#endif
355
	//NO MMX just normal asm ... FIXME try/write funny MMX2 variant
356
	//FIXME add prefetch
1165
	//NO MMX just normal asm ...
357 1166
	asm volatile(
358 1167
		"xorl %%eax, %%eax		\n\t" // i
359 1168
		"xorl %%ebx, %%ebx		\n\t" // xx
......
438 1247
		"xorl %%ebx, %%ebx		\n\t"
439 1248
		"movw %4, %%bx			\n\t" // (s_xinc*4)&0xFFFF
440 1249

  
441
//	"int $3\n\t"
1250
#ifdef HAVE_MMX2
442 1251
#define FUNNYUVCODE \
443
		"call funnyUVCode		\n\t"\
444
		"movq temp0, %%mm2		\n\t"\
445
		"xorl %%ecx, %%ecx		\n\t"
1252
			"prefetchnta 1024(%%esi)	\n\t"\
1253
			"prefetchnta 1056(%%esi)	\n\t"\
1254
			"prefetchnta 1088(%%esi)	\n\t"\
1255
			"call funnyUVCode		\n\t"\
1256
			"movq temp0, %%mm2		\n\t"\
1257
			"xorl %%ecx, %%ecx		\n\t"
1258
#else
1259
#define FUNNYUVCODE \
1260
			"call funnyUVCode		\n\t"\
1261
			"movq temp0, %%mm2		\n\t"\
1262
			"xorl %%ecx, %%ecx		\n\t"
1263
#endif
446 1264

  
447 1265
FUNNYUVCODE
448 1266
FUNNYUVCODE
......
455 1273
FUNNYUVCODE
456 1274

  
457 1275

  
458

  
459 1276
		"xorl %%eax, %%eax		\n\t" // i
460 1277
		"movl %6, %%esi			\n\t" // src
461 1278
		"movl %1, %%edi			\n\t" // buf1
......
471 1288
FUNNYUVCODE
472 1289
FUNNYUVCODE
473 1290

  
474
		:: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" ((s_xinc2*4)>>16),
1291
		:: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" ((s_xinc2*4)>>16),
475 1292
		  "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2)
476 1293
		: "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
477 1294
	);
478
		for(i=dstw-1; (i*s_xinc2)>>16 >=srcWidth/2-1; i--)
1295
		for(i=dstUVw-1; (i*s_xinc2)>>16 >=srcWidth/2-1; i--)
479 1296
		{
480 1297
			uvbuf1[i] = src1[srcWidth/2-1]*128;
481 1298
			uvbuf1[i+2048] = src2[srcWidth/2-1]*128;
......
516 1333
		"cmpl %2, %%eax			\n\t"
517 1334
		" jb 1b				\n\t"
518 1335

  
519

  
520
		:: "m" (src1), "m" (uvbuf1), "m" (dstw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF),
1336
		:: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF),
521 1337
		"r" (src2)
522 1338
		: "%eax", "%ebx", "%ecx", "%edi", "%esi"
523 1339
		);
......
525 1341
	} //if MMX2 cant be used
526 1342
#endif
527 1343
#else
528
      for(i=0;i<dstw;i++){
1344
      for(i=0;i<dstUVw;i++){
529 1345
	  register unsigned int xx=xpos>>16;
530 1346
          register unsigned int xalpha=(xpos&0xFFFF)>>9;
531 1347
	  uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
......
541 1357
	}
542 1358
    }
543 1359

  
1360
	if(ABS(s_yinc - 0x10000) < 10)
1361
		yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
1362
	else
1363
		yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
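
The dispatch above keys on the vertical step: s_yinc is 16.16 fixed point, so 0x10000 means exactly one source line per destination line, and anything within 10/65536 of that is treated as unscaled and routed to the cheaper yuv2rgb1() path (no vertical interpolation), as advertised in the NOTES at the top. Sketch (illustration):

	int vertically_unscaled = ABS(s_yinc - 0x10000) < 10;   /* within ~0.015% of a 1:1 vertical scale */
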
544 1364

  
545
    // Note1: this code can be resticted to n*8 (or n*16) width lines to simplify optimization...
546
    // Re: Note1: ok n*4 for now
547
    // Note2: instead of using lookup tabs, mmx version could do the multiply...
548
    // Re: Note2: yep
549
    // Note3: maybe we should make separated 15/16, 24 and 32bpp version of this:
550
    // Re: done (32 & 16) and 16 has dithering :) but 16 is untested
551 1365
#ifdef HAVE_MMX
552
	//FIXME write lq version with less uv ...
553
	//FIXME reorder / optimize
554
	if(dstbpp == 32)
555
	{
556
		asm volatile(
557

  
558
#define YSCALEYUV2RGB \
559
		"pxor %%mm7, %%mm7		\n\t"\
560
		"movd %6, %%mm6			\n\t" /*yalpha1*/\
561
		"punpcklwd %%mm6, %%mm6		\n\t"\
562
		"punpcklwd %%mm6, %%mm6		\n\t"\
563
		"movd %7, %%mm5			\n\t" /*uvalpha1*/\
564
		"punpcklwd %%mm5, %%mm5		\n\t"\
565
		"punpcklwd %%mm5, %%mm5		\n\t"\
566
		"xorl %%eax, %%eax		\n\t"\
567
		"1:				\n\t"\
568
		"movq (%0, %%eax, 2), %%mm0	\n\t" /*buf0[eax]*/\
569
		"movq (%1, %%eax, 2), %%mm1	\n\t" /*buf1[eax]*/\
570
		"movq (%2, %%eax,2), %%mm2	\n\t" /* uvbuf0[eax]*/\
571
		"movq (%3, %%eax,2), %%mm3	\n\t" /* uvbuf1[eax]*/\
572
		"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
573
		"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
574
		"pmulhw %%mm6, %%mm0		\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
575
		"pmulhw %%mm5, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
576
		"psraw $7, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>7*/\
577
		"movq 4096(%2, %%eax,2), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
578
		"psraw $7, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
579
		"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
580
		"movq 4096(%3, %%eax,2), %%mm0	\n\t" /* uvbuf1[eax+2048]*/\
581
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
582
		"psubw %%mm0, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
583
		"psubw w10, %%mm1		\n\t" /* Y-16*/\
584
		"psubw w80, %%mm3		\n\t" /* (U-128)*/\
585
		"psllw $3, %%mm1		\n\t" /* (y-16)*8*/\
586
		"psllw $3, %%mm3		\n\t" /*(U-128)8*/\
587
		"pmulhw yCoeff, %%mm1		\n\t"\
588
\
589
\
590
		"pmulhw %%mm5, %%mm4		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
591
		"movq %%mm3, %%mm2		\n\t" /* (U-128)8*/\
592
		"pmulhw ubCoeff, %%mm3		\n\t"\
593
		"psraw $7, %%mm0		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
594
		"pmulhw ugCoeff, %%mm2		\n\t"\
595
		"paddw %%mm4, %%mm0		\n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
596
		"psubw w80, %%mm0		\n\t" /* (V-128)*/\
597
		"psllw $3, %%mm0		\n\t" /* (V-128)8*/\
598
\
599
\
600
		"movq %%mm0, %%mm4		\n\t" /* (V-128)8*/\
601
		"pmulhw vrCoeff, %%mm0		\n\t"\
602
		"pmulhw vgCoeff, %%mm4		\n\t"\
603
		"paddw %%mm1, %%mm3		\n\t" /* B*/\
604
		"paddw %%mm1, %%mm0		\n\t" /* R*/\
605
		"packuswb %%mm3, %%mm3		\n\t"\
606
\
607
		"packuswb %%mm0, %%mm0		\n\t"\
608
		"paddw %%mm4, %%mm2		\n\t"\
609
		"paddw %%mm2, %%mm1		\n\t" /* G*/\
610
\
611
		"packuswb %%mm1, %%mm1		\n\t"
612

  
613
YSCALEYUV2RGB
614
		"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
615
		"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0
616

  
617
		"movq %%mm3, %%mm1		\n\t"
618
		"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
619
		"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0
620
#ifdef HAVE_MMX2
621
		"movntq %%mm3, (%4, %%eax, 4)	\n\t"
622
		"movntq %%mm1, 8(%4, %%eax, 4)	\n\t"
623
#else
624
		"movq %%mm3, (%4, %%eax, 4)	\n\t"
625
		"movq %%mm1, 8(%4, %%eax, 4)	\n\t"
626
#endif
627
		"addl $4, %%eax			\n\t"
628
		"cmpl %5, %%eax			\n\t"
629
		" jb 1b				\n\t"
630

  
631

  
632
		:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
633
		"m" (yalpha1), "m" (uvalpha1)
634
		: "%eax"
635
		);
636
	}
637
	else if(dstbpp==24)
638
	{
639
		asm volatile(
640

  
641
YSCALEYUV2RGB
642

  
643
							// lsb ... msb
644
		"punpcklbw %%mm1, %%mm3		\n\t" // BGBGBGBG
645
		"punpcklbw %%mm7, %%mm0		\n\t" // R0R0R0R0
646

  
647
		"movq %%mm3, %%mm1		\n\t"
648
		"punpcklwd %%mm0, %%mm3		\n\t" // BGR0BGR0
649
		"punpckhwd %%mm0, %%mm1		\n\t" // BGR0BGR0
650

  
651
		"movq %%mm3, %%mm2		\n\t" // BGR0BGR0
652
		"psrlq $8, %%mm3		\n\t" // GR0BGR00
653
		"pand bm00000111, %%mm2		\n\t" // BGR00000
654
		"pand bm11111000, %%mm3		\n\t" // 000BGR00
655
		"por %%mm2, %%mm3		\n\t" // BGRBGR00
656
		"movq %%mm1, %%mm2		\n\t"
657
		"psllq $48, %%mm1		\n\t" // 000000BG
658
		"por %%mm1, %%mm3		\n\t" // BGRBGRBG
659

  
660
		"movq %%mm2, %%mm1		\n\t" // BGR0BGR0
661
		"psrld $16, %%mm2		\n\t" // R000R000
662
		"psrlq $24, %%mm1		\n\t" // 0BGR0000
663
		"por %%mm2, %%mm1		\n\t" // RBGRR000
664

  
665
		"movl %4, %%ebx			\n\t"
666
		"addl %%eax, %%ebx		\n\t"
667
#ifdef HAVE_MMX2
668
		//FIXME Alignment
669
		"movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
670
		"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
671
#else
672
		"movd %%mm3, (%%ebx, %%eax, 2)	\n\t"
673
		"psrlq $32, %%mm3		\n\t"
674
		"movd %%mm3, 4(%%ebx, %%eax, 2)	\n\t"
675
		"movd %%mm1, 8(%%ebx, %%eax, 2)	\n\t"
676
#endif
677
		"addl $4, %%eax			\n\t"
678
		"cmpl %5, %%eax			\n\t"
679
		" jb 1b				\n\t"
680

  
681
		:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
682
		"m" (yalpha1), "m" (uvalpha1)
683
		: "%eax", "%ebx"
684
		);
685
	}
686
	else if(dstbpp==16)
687
	{
688
		asm volatile(
689

  
690
YSCALEYUV2RGB
691
#ifdef DITHER16BPP
692
		"paddusb g16Dither, %%mm1	\n\t"
693
		"paddusb b16Dither, %%mm0	\n\t"
694
		"paddusb b16Dither, %%mm3	\n\t"
695
#endif
696
		"punpcklbw %%mm7, %%mm1		\n\t" // 0G0G0G0G
697
		"punpcklbw %%mm7, %%mm3		\n\t" // 0B0B0B0B
698
		"punpcklbw %%mm7, %%mm0		\n\t" // 0R0R0R0R
699

  
700
		"psrlw $3, %%mm3		\n\t"
701
		"psllw $3, %%mm1		\n\t"
702
		"psllw $8, %%mm0		\n\t"
703
		"pand g16Mask, %%mm1		\n\t"
704
		"pand r16Mask, %%mm0		\n\t"
705

  
706
		"por %%mm3, %%mm1		\n\t"
707
		"por %%mm1, %%mm0		\n\t"
708
#ifdef HAVE_MMX2
709
		"movntq %%mm0, (%4, %%eax, 2)	\n\t"
710
#else
711
		"movq %%mm0, (%4, %%eax, 2)	\n\t"
712
#endif
713
		"addl $4, %%eax			\n\t"
714
		"cmpl %5, %%eax			\n\t"
715
		" jb 1b				\n\t"
716

  
717
		:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
718
		"m" (yalpha1), "m" (uvalpha1)
719
		: "%eax"
720
		);
721
	}
722
#else
723
	if(dstbpp==32 || dstbpp==24)
724
	{
725
		for(i=0;i<dstw;i++){
726
			// vertical linear interpolation && yuv2rgb in a single step:
727
			int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
728
			int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
729
			int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
730
			dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
731
			dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
732
			dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
733
			dest+=dstbpp>>3;
734
		}
735
	}
736
	else if(dstbpp==16)
737
	{
738
		for(i=0;i<dstw;i++){
739
			// vertical linear interpolation && yuv2rgb in a single step:
740
			int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
741
			int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
742
			int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
743

  
744
			((uint16_t*)dest)[0] =
745
				(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
746
				(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
747
				(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
748
			dest+=2;
749
		}
750
	}
751
	else if(dstbpp==15) //15bit FIXME how do i figure out if its 15 or 16?
752
	{
753
		for(i=0;i<dstw;i++){
754
			// vertical linear interpolation && yuv2rgb in a single step:
755
			int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>16)];
756
			int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>16);
757
			int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>16);
758

  
759
			((uint16_t*)dest)[0] =
760
				(clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
761
				(clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
762
				(clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
763
			dest+=2;
764
		}
765
	}
766
#endif
767

  
768
	b16Dither= b16Dither1;
1366
    	b16Dither= b16Dither1;
769 1367
	b16Dither1= b16Dither2;
770 1368
	b16Dither2= b16Dither;
771 1369

  
772 1370
	g16Dither= g16Dither1;
773 1371
	g16Dither1= g16Dither2;
774 1372
	g16Dither2= g16Dither;
1373
#endif
775 1374
  }
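
The three-way assignments just above rotate the dither constants once per output line, so the bias added by paddusb before the 15/16-bit packing alternates between two byte patterns from one line to the next, spreading the truncation error vertically. Viewed per byte (little-endian, illustration only):

	/* 0x0004000400040004 -> bytes 04 00 04 00 04 00 04 00   (one bias row)      */
	/* 0x0602060206020602 -> bytes 02 06 02 06 02 06 02 06   (the alternate row) */
	/* the g16Dither pair 0x0002.../0x0301... decomposes the same way            */
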
776 1375

  
777 1376
#ifdef HAVE_3DNOW
postproc/swscale_template.c
3 3

  
4 4
// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
5 5
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
6
// the parts written by michael are under GNU GPL
6 7

  
7 8
#include <inttypes.h>
8 9
#include "../config.h"
10
#include "swscale.h"
9 11

  
10 12
//#undef HAVE_MMX2
11 13
//#undef HAVE_MMX
12 14
//#undef ARCH_X86
13
#define DITHER16BPP
14
//#define ALT_ERROR
15
#define DITHER1XBPP
16
int fullUVIpol=0;
17
//disables the unscaled height version
18
int allwaysIpol=0;
15 19

  
16 20
#define RET 0xC3 //near return opcode
17 21
/*
18 22
NOTES
19 23

  
20
known BUGS with known cause (no bugreports please!)
21
code reads 1 sample too much (might cause a sig11)
24
known BUGS with known cause (no bugreports please!, but patches are welcome :) )
25
horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)
26

  
27
Supported output formats BGR15 BGR16 BGR24 BGR32 (15,24 are untested)
28
BGR15 & BGR16 MMX versions support dithering
29
Special versions: fast Y 1:1 scaling (no interpolation in y direction)
22 30

  
23 31
TODO
24
check alignment off everything
32
more intelligent misalignment avoidance for the horizontal scaler
25 33
*/
26 34

  
27
static uint64_t yCoeff=    0x2568256825682568LL;
28
static uint64_t ubCoeff=   0x3343334333433343LL;
29
static uint64_t vrCoeff=   0x40cf40cf40cf40cfLL;
30
static uint64_t ugCoeff=   0xE5E2E5E2E5E2E5E2LL;
31
static uint64_t vgCoeff=   0xF36EF36EF36EF36ELL;
32
static uint64_t w80=       0x0080008000800080LL;
33
static uint64_t w10=       0x0010001000100010LL;
34
static uint64_t bm00000111=0x0000000000FFFFFFLL;
35
static uint64_t bm11111000=0xFFFFFFFFFF000000LL;
36

  
37
static uint64_t b16Dither= 0x0004000400040004LL;
38
static uint64_t b16Dither1=0x0004000400040004LL;
39
static uint64_t b16Dither2=0x0602060206020602LL;
40
static uint64_t g16Dither= 0x0002000200020002LL;
41
static uint64_t g16Dither1=0x0002000200020002LL;
42
static uint64_t g16Dither2=0x0301030103010301LL;
43

  
44
static uint64_t b16Mask=   0x001F001F001F001FLL;
45
static uint64_t g16Mask=   0x07E007E007E007E0LL;
46
static uint64_t r16Mask=   0xF800F800F800F800LL;
47
static uint64_t temp0;
35
#define ABS(a) ((a) > 0 ? (a) : (-(a)))
36

  
37
#ifdef HAVE_MMX2
38
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
39
#elif defined (HAVE_3DNOW)
40
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
41
#endif
48 42

  
43
#ifdef HAVE_MMX2
44
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
45
#else
46
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
47
#endif
48

  
49

  
50
#ifdef HAVE_MMX
51
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
52
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x3343334333433343LL;
53
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x40cf40cf40cf40cfLL;
54
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xE5E2E5E2E5E2E5E2LL;
55
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xF36EF36EF36EF36ELL;
56
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
57
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
58
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
59
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
60
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
61
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;
62

  
63
static uint64_t __attribute__((aligned(8))) b16Dither= 0x0004000400040004LL;
64
static uint64_t __attribute__((aligned(8))) b16Dither1=0x0004000400040004LL;
65
static uint64_t __attribute__((aligned(8))) b16Dither2=0x0602060206020602LL;
66
static uint64_t __attribute__((aligned(8))) g16Dither= 0x0002000200020002LL;
67
static uint64_t __attribute__((aligned(8))) g16Dither1=0x0002000200020002LL;
68
static uint64_t __attribute__((aligned(8))) g16Dither2=0x0301030103010301LL;
69

  
70
static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
71
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
72
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
73
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
74
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
75
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;
76

  
77
static uint64_t __attribute__((aligned(8))) temp0;
78
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
79
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
80
#endif
49 81

  
50 82
// temporary storage for 4 yuv lines:
51 83
// 16bit for now (mmx likes it more compact)
84
#ifdef HAVE_MMX
85
static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
86
static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
87
#else
52 88
static uint16_t pix_buf_y[4][2048];
53 89
static uint16_t pix_buf_uv[2][2048*2];
90
#endif
54 91

  
55 92
// clipping helper table for C implementations:
56 93
static unsigned char clip_table[768];
......
66 103
static uint8_t funnyYCode[10000];
67 104
static uint8_t funnyUVCode[10000];
68 105

  
106
#define FULL_YSCALEYUV2RGB \
107
		"pxor %%mm7, %%mm7		\n\t"\
108
		"movd %6, %%mm6			\n\t" /*yalpha1*/\
109
		"punpcklwd %%mm6, %%mm6		\n\t"\
110
		"punpcklwd %%mm6, %%mm6		\n\t"\
111
		"movd %7, %%mm5			\n\t" /*uvalpha1*/\
112
		"punpcklwd %%mm5, %%mm5		\n\t"\
113
		"punpcklwd %%mm5, %%mm5		\n\t"\
114
		"xorl %%eax, %%eax		\n\t"\
115
		"1:				\n\t"\
116
		"movq (%0, %%eax, 2), %%mm0	\n\t" /*buf0[eax]*/\
117
		"movq (%1, %%eax, 2), %%mm1	\n\t" /*buf1[eax]*/\
118
		"movq (%2, %%eax,2), %%mm2	\n\t" /* uvbuf0[eax]*/\
119
		"movq (%3, %%eax,2), %%mm3	\n\t" /* uvbuf1[eax]*/\
120
		"psubw %%mm1, %%mm0		\n\t" /* buf0[eax] - buf1[eax]*/\
121
		"psubw %%mm3, %%mm2		\n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
122
		"pmulhw %%mm6, %%mm0		\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
123
		"pmulhw %%mm5, %%mm2		\n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
124
		"psraw $4, %%mm1		\n\t" /* buf0[eax] - buf1[eax] >>4*/\
125
		"movq 4096(%2, %%eax,2), %%mm4	\n\t" /* uvbuf0[eax+2048]*/\
126
		"psraw $4, %%mm3		\n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
127
		"paddw %%mm0, %%mm1		\n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
128
		"movq 4096(%3, %%eax,2), %%mm0	\n\t" /* uvbuf1[eax+2048]*/\
129
		"paddw %%mm2, %%mm3		\n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
130
		"psubw %%mm0, %%mm4		\n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
131
		"psubw w80, %%mm1		\n\t" /* 8(Y-16)*/\
132
		"psubw w400, %%mm3		\n\t" /* 8(U-128)*/\
133
		"pmulhw yCoeff, %%mm1		\n\t"\
134
\
135
\
136
		"pmulhw %%mm5, %%mm4		\n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
... This diff was truncated because it exceeds the maximum size that can be displayed.
