ffmpeg / postproc / swscale.c @ 3fe8e8f0


// Software scaling and colorspace conversion routines for MPlayer

// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by michael are under GNU GPL

#include <inttypes.h>
#include <string.h>
#include "../config.h"
#include "swscale.h"
#include "../mmx_defs.h"
#undef MOVNTQ
#undef PAVGB

//#undef HAVE_MMX2
//#undef HAVE_MMX
//#undef ARCH_X86
#define DITHER1XBPP
int fullUVIpol=0;
//disables the unscaled height version
int allwaysIpol=0;

#define RET 0xC3 //near return opcode
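// 0xC3 is the x86 near-return opcode; presumably used to terminate the
// run-time generated horizontal scaler code in funnyYCode/funnyUVCode below.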
/*
NOTES

known BUGS with known cause (no bug reports please!, but patches are welcome :) )
the horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)

Supported output formats: BGR15, BGR16, BGR24, BGR32
the BGR15 & BGR16 MMX versions support dithering
Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
bicubic scaler
dither in C
change the distance of the u & v buffer
*/

#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
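// PAVGB is a packed average of unsigned bytes (pavgb on MMX2, pavgusb on
// 3DNow!); MOVNTQ is a non-temporal store that bypasses the cache on MMX2
// and degrades to a plain movq elsewhere.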

#ifdef HAVE_MMX
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
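// These are the YUV->RGB matrix coefficients in signed 16-bit fixed point,
// replicated into all four lanes for pmulhw (which keeps the high 16 bits of
// each 16x16 product): 0x3343/8192 ~= 1.60 (V->R), 0x40cf/8192 ~= 2.03 (U->B),
// and vgCoeff/ugCoeff are the negative green terms -0x1A1E and -0x0C92,
// matching the yuvtab_1a1e/yuvtab_0c92 lookup tables below.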
static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;

static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
        0x0103010301030103LL,
        0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
        0x0602060206020602LL,
        0x0004000400040004LL,};
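// dither4/dither8 look like the two rows of a 2x2 ordered-dither pattern
// (amplitudes 0-3 and 0-6 respectively, for the 6-bit and 5-bit channels);
// presumably the b5/g5/g6/r5Dither variables above are switched between the
// two rows on alternate output lines.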

static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;

static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;

static uint64_t __attribute__((aligned(8))) temp0;
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif

// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
// (each uv buffer stores U in its first 2048 words and V at word offset +2048,
// which is the 4096-byte offset seen in the asm below)
#ifdef HAVE_MMX
static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
#else
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];
#endif

// clipping helper table for C implementations:
static unsigned char clip_table[768];

static unsigned short clip_table16b[768];
static unsigned short clip_table16g[768];
static unsigned short clip_table16r[768];
static unsigned short clip_table15b[768];
static unsigned short clip_table15g[768];
static unsigned short clip_table15r[768];

// yuv->rgb conversion tables:
static    int yuvtab_2568[256];
static    int yuvtab_3343[256];
static    int yuvtab_0c92[256];
static    int yuvtab_1a1e[256];
static    int yuvtab_40cf[256];
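/* The hex names are the fixed-point factors baked into each table. An
   illustrative initialization (an assumption -- the real one lives in the
   init code elsewhere in this file) would be along these lines:

        for(i=0; i<256; i++)
        {
                yuvtab_2568[i]= 0x2568*(i- 16) + (256<<13); // luma, biased into clip_table's center
                yuvtab_3343[i]= 0x3343*(i-128);             // V contribution to R
                yuvtab_40cf[i]= 0x40cf*(i-128);             // U contribution to B
                // yuvtab_0c92/yuvtab_1a1e hold the negative green terms
        }

   so that e.g. clip_table[(Y + yuvtab_40cf[U])>>13] in the C paths below
   yields the blue component. */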

#ifdef HAVE_MMX2
static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
#endif

static int canMMX2BeUsed=0;
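
// The vertical scaler macros below blend two source lines with the 12-bit
// weights passed in %6/%7 (note yalpha1 = 4095-yalpha at the call sites),
// keep 3 fractional bits, and apply the coefficients above without leaving
// the MMX registers; they loop until the pixel counter in %%eax reaches %5.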

#define FULL_YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7                \n\t"\
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "xorl %%eax, %%eax                \n\t"\
                "1:                                \n\t"\
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq (%2, %%eax,2), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax,2), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm6, %%mm0                \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw %%mm5, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
                "movq 4096(%2, %%eax,2), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "movq 4096(%3, %%eax,2), %%mm0        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
                "psubw %%mm0, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w400, %%mm3                \n\t" /* 8(U-128)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
\
\
                "pmulhw %%mm5, %%mm4                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "pmulhw ubCoeff, %%mm3                \n\t"\
                "psraw $4, %%mm0                \n\t" /* uvbuf1[eax+2048] >>4*/\
                "pmulhw ugCoeff, %%mm2                \n\t"\
                "paddw %%mm4, %%mm0                \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw w400, %%mm0                \n\t" /* (V-128)8*/\
\
\
                "movq %%mm0, %%mm4                \n\t" /* (V-128)8*/\
                "pmulhw vrCoeff, %%mm0                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
                "paddw %%mm1, %%mm3                \n\t" /* B*/\
                "paddw %%mm1, %%mm0                \n\t" /* R*/\
                "packuswb %%mm3, %%mm3                \n\t"\
\
                "packuswb %%mm0, %%mm0                \n\t"\
                "paddw %%mm4, %%mm2                \n\t"\
                "paddw %%mm2, %%mm1                \n\t" /* G*/\
\
                "packuswb %%mm1, %%mm1                \n\t"

#define YSCALEYUV2RGB \
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movq %%mm6, asm_yalpha1        \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "movq %%mm5, asm_uvalpha1        \n\t"\
                "xorl %%eax, %%eax                \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, %%eax), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq asm_uvalpha1, %%mm0        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm6        \n\t" /*buf0[eax+4]*/\
                "movq 8(%1, %%eax, 2), %%mm7        \n\t" /*buf1[eax+4]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
                "pmulhw asm_yalpha1, %%mm0        \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw asm_yalpha1, %%mm6        \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf1[eax+4] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

// YV12 to RGB reading only buf0/uvbuf0 (no vertical interpolation at all)
#define YSCALEYUV2RGB1 \
                "xorl %%eax, %%eax                \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, %%eax), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] >>4*/\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm7        \n\t" /*buf0[eax+4]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

// do vertical chrominance interpolation: like YSCALEYUV2RGB1, but averages
// the two chroma lines
#define YSCALEYUV2RGB1b \
                "xorl %%eax, %%eax                \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, %%eax), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $5, %%mm3                \n\t"\
                "psrlw $5, %%mm4                \n\t"\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm7        \n\t" /*buf0[eax+4]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

#define WRITEBGR32 \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        MOVNTQ(%%mm0, (%4, %%eax, 4))\
                        MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
                        MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
                        MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR16 \
                        "pand bF8, %%mm2                \n\t" /* B */\
                        "pand bFC, %%mm4                \n\t" /* G */\
                        "pand bF8, %%mm5                \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $3, %%mm3                \n\t"\
                        "psllq $3, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR15 \
                        "pand bF8, %%mm2                \n\t" /* B */\
                        "pand bF8, %%mm4                \n\t" /* G */\
                        "pand bF8, %%mm5                \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
                        "psrlq $1, %%mm5                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $2, %%mm3                \n\t"\
                        "psllq $2, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR24OLD \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand bm00000111, %%mm4                \n\t" /* 00000RGB 0 */\
                        "pand bm11111000, %%mm0                \n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4                \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4                \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2                \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2                \n\t" /* 000RRGBR 1 */\
                        "pand bm00001111, %%mm2                \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4                \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand bm00000111, %%mm4                \n\t" /* 00000RGB 2 */\
                        "pand bm11111000, %%mm1                \n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1                \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4                \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2                \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4                \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5                \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand bm00000111, %%mm5                \n\t" /* 00000RGB 3 */\
                        "pand bm11111000, %%mm3                \n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3                \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3                \n\t" /* RGBRGBRG 2.5 */\
\
                        MOVNTQ(%%mm0, (%%ebx))\
                        MOVNTQ(%%mm2, 8(%%ebx))\
                        MOVNTQ(%%mm3, 16(%%ebx))\
                        "addl $24, %%ebx                \n\t"\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR24MMX \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGB0RGB 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGB0RGB 2 */\
                        "movq %%mm3, %%mm7                \n\t" /* 0RGB0RGB 3 */\
\
                        "psllq $40, %%mm0                \n\t" /* RGB00000 0 */\
                        "psllq $40, %%mm2                \n\t" /* RGB00000 1 */\
                        "psllq $40, %%mm1                \n\t" /* RGB00000 2 */\
                        "psllq $40, %%mm3                \n\t" /* RGB00000 3 */\
\
                        "punpckhdq %%mm4, %%mm0                \n\t" /* 0RGBRGB0 0 */\
                        "punpckhdq %%mm6, %%mm2                \n\t" /* 0RGBRGB0 1 */\
                        "punpckhdq %%mm5, %%mm1                \n\t" /* 0RGBRGB0 2 */\
                        "punpckhdq %%mm7, %%mm3                \n\t" /* 0RGBRGB0 3 */\
\
                        "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGBRGB0 1 */\
                        "psllq $40, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
                        MOVNTQ(%%mm0, (%%ebx))\
\
                        "psrlq $24, %%mm6                \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGBRGB0 2 */\
                        "psllq $24, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm6                \n\t" /* BRGBRGBR 1 */\
                        MOVNTQ(%%mm6, 8(%%ebx))\
\
                        "psrlq $40, %%mm5                \n\t" /* 000000RG 2 */\
                        "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm3, %%mm5                \n\t" /* RGBRGBRG 2 */\
                        MOVNTQ(%%mm5, 16(%%ebx))\
\
                        "addl $24, %%ebx                \n\t"\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR24MMX2 \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq M24A, %%mm0                \n\t"\
                        "movq M24C, %%mm7                \n\t"\
                        "pshufw $0x50, %%mm2, %%mm1        \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
                        "pshufw $0x50, %%mm4, %%mm3        \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
                        "pshufw $0x00, %%mm5, %%mm6        \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
                        "pand %%mm0, %%mm1                \n\t" /*    B2        B1       B0 */\
                        "pand %%mm0, %%mm3                \n\t" /*    G2        G1       G0 */\
                        "pand %%mm7, %%mm6                \n\t" /*       R1        R0       */\
\
                        "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
                        "por %%mm1, %%mm6                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, (%%ebx))\
\
                        "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
                        "pshufw $0xA5, %%mm2, %%mm1        \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
                        "pshufw $0x55, %%mm4, %%mm3        \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
                        "pshufw $0xA5, %%mm5, %%mm6        \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
                        "pand M24B, %%mm1                \n\t" /* B5       B4        B3    */\
                        "pand %%mm7, %%mm3                \n\t" /*       G4        G3       */\
                        "pand %%mm0, %%mm6                \n\t" /*    R4        R3       R2 */\
\
                        "por %%mm1, %%mm3                \n\t" /* B5    G4 B4     G3 B3    */\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 8(%%ebx))\
\
                        "pshufw $0xFF, %%mm2, %%mm1        \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
                        "pshufw $0xFA, %%mm4, %%mm3        \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
                        "pshufw $0xFA, %%mm5, %%mm6        \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
                        "pand %%mm7, %%mm1                \n\t" /*       B7        B6       */\
                        "pand %%mm0, %%mm3                \n\t" /*    G7        G6       G5 */\
                        "pand M24B, %%mm6                \n\t" /* R7       R6        R5    */\
\
                        "por %%mm1, %%mm3                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 16(%%ebx))\
\
                        "addl $24, %%ebx                \n\t"\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#ifdef HAVE_MMX2
#define WRITEBGR24 WRITEBGR24MMX2
#else
#define WRITEBGR24 WRITEBGR24MMX
#endif
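// pshufw, used by WRITEBGR24MMX2 to gather the byte triplets, is only
// available with MMX2, hence the plain-MMX fallback.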

#ifdef HAVE_MMX
// referencing the constants above from C keeps gcc from warning about /
// optimizing away variables that are otherwise only used inside inline asm
void in_asm_used_var_warning_killer()
{
 int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+
 M24A+M24B+M24C;
 if(i) i=0;
}
#endif

static inline void yuv2yuv(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                           uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstw, int yalpha, int uvalpha)
{
        int yalpha1=yalpha^4095; // == 4095-yalpha for 12-bit alphas
        int uvalpha1=uvalpha^4095;
        int i;

        asm volatile ("\n\t"::: "memory"); // compiler optimization barrier

        for(i=0;i<dstw;i++)
        {
                ((uint8_t*)dest)[i] = (buf0[i]*yalpha1+buf1[i]*yalpha)>>19;
        }

        if(uvalpha != -1)
        {
                for(i=0; i<(dstw>>1); i++)
                {
                        ((uint8_t*)uDest)[i] = (uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19;
                        ((uint8_t*)vDest)[i] = (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;
                }
        }
}
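// The >>19 works out as: the line buffers hold samples scaled by 2^7 (cf. the
// "8(Y-16)" arithmetic in the asm macros above) and the blend weights are
// 12-bit, so (x<<7)*4096 >> 19 == x. uvalpha == -1 apparently signals that no
// chroma should be written for this line (4:2:0 has chroma on every 2nd line).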

/**
 * vertical scale YV12 to RGB
 */
static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
        int yalpha1=yalpha^4095;
        int uvalpha1=uvalpha^4095;

        if(fullUVIpol)
        {

#ifdef HAVE_MMX
                if(dstbpp == 32)
                {
                        asm volatile(

FULL_YSCALEYUV2RGB
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0

                        "movq %%mm3, %%mm1                \n\t"
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0

                        MOVNTQ(%%mm3, (%4, %%eax, 4))
                        MOVNTQ(%%mm1, 8(%4, %%eax, 4))

                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        asm volatile(

FULL_YSCALEYUV2RGB

                                                                // lsb ... msb
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0

                        "movq %%mm3, %%mm1                \n\t"
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0

                        "movq %%mm3, %%mm2                \n\t" // BGR0BGR0
                        "psrlq $8, %%mm3                \n\t" // GR0BGR00
                        "pand bm00000111, %%mm2                \n\t" // BGR00000
                        "pand bm11111000, %%mm3                \n\t" // 000BGR00
                        "por %%mm2, %%mm3                \n\t" // BGRBGR00
                        "movq %%mm1, %%mm2                \n\t"
                        "psllq $48, %%mm1                \n\t" // 000000BG
                        "por %%mm1, %%mm3                \n\t" // BGRBGRBG

                        "movq %%mm2, %%mm1                \n\t" // BGR0BGR0
                        "psrld $16, %%mm2                \n\t" // R000R000
                        "psrlq $24, %%mm1                \n\t" // 0BGR0000
                        "por %%mm2, %%mm1                \n\t" // RBGRR000

                        "movl %4, %%ebx                        \n\t"
                        "addl %%eax, %%ebx                \n\t"

#ifdef HAVE_MMX2
                        //FIXME Alignment
                        "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
                        "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
                        "movd %%mm3, (%%ebx, %%eax, 2)        \n\t"
                        "psrlq $32, %%mm3                \n\t"
                        "movd %%mm3, 4(%%ebx, %%eax, 2)        \n\t"
                        "movd %%mm1, 8(%%ebx, %%eax, 2)        \n\t"
#endif
                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                        "paddusb g5Dither, %%mm1        \n\t"
                        "paddusb r5Dither, %%mm0        \n\t"
                        "paddusb b5Dither, %%mm3        \n\t"
#endif
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R

                        "psrlw $3, %%mm3                \n\t"
                        "psllw $2, %%mm1                \n\t"
                        "psllw $7, %%mm0                \n\t"
                        "pand g15Mask, %%mm1                \n\t"
                        "pand r15Mask, %%mm0                \n\t"

                        "por %%mm3, %%mm1                \n\t"
                        "por %%mm1, %%mm0                \n\t"

                        MOVNTQ(%%mm0, (%4, %%eax, 2))

                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                        "paddusb g6Dither, %%mm1        \n\t"
                        "paddusb r5Dither, %%mm0        \n\t"
                        "paddusb b5Dither, %%mm3        \n\t"
#endif
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R

                        "psrlw $3, %%mm3                \n\t"
                        "psllw $3, %%mm1                \n\t"
                        "psllw $8, %%mm0                \n\t"
                        "pand g16Mask, %%mm1                \n\t"
                        "pand r16Mask, %%mm0                \n\t"

                        "por %%mm3, %%mm1                \n\t"
                        "por %%mm1, %%mm0                \n\t"

                        MOVNTQ(%%mm0, (%4, %%eax, 2))

                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
#else
                asm volatile ("\n\t"::: "memory");

                if(dstbpp==32 || dstbpp==24)
                {
                        int i;
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                                dest+=dstbpp>>3;
                        }
                }
                else if(dstbpp==16)
                {
                        int i;
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[i] =
                                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
                        }
                }
                else if(dstbpp==15)
                {
                        int i;
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[i] =
                                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
                        }
                }
#endif
        }//FULL_UV_IPOL
842
        else
843
        {
844
#ifdef HAVE_MMX
845
                if(dstbpp == 32)
846
                {
847
                        asm volatile(
848
                                YSCALEYUV2RGB
849
                                WRITEBGR32
850

    
851
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
852
                        "m" (yalpha1), "m" (uvalpha1)
853
                        : "%eax"
854
                        );
855
                }
856
                else if(dstbpp==24)
857
                {
858
                        asm volatile(
859
                                "movl %4, %%ebx                        \n\t"
860
                                YSCALEYUV2RGB
861
                                WRITEBGR24
862

    
863
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
864
                        "m" (yalpha1), "m" (uvalpha1)
865
                        : "%eax", "%ebx"
866
                        );
867
                }
868
                else if(dstbpp==15)
869
                {
870
                        asm volatile(
871
                                YSCALEYUV2RGB
872
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
873
#ifdef DITHER1XBPP
874
                                "paddusb b5Dither, %%mm2        \n\t"
875
                                "paddusb g5Dither, %%mm4        \n\t"
876
                                "paddusb r5Dither, %%mm5        \n\t"
877
#endif
878

    
879
                                WRITEBGR15
880

    
881
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
882
                        "m" (yalpha1), "m" (uvalpha1)
883
                        : "%eax"
884
                        );
885
                }
886
                else if(dstbpp==16)
887
                {
888
                        asm volatile(
889
                                YSCALEYUV2RGB
890
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
891
#ifdef DITHER1XBPP
892
                                "paddusb b5Dither, %%mm2        \n\t"
893
                                "paddusb g6Dither, %%mm4        \n\t"
894
                                "paddusb r5Dither, %%mm5        \n\t"
895
#endif
896

    
897
                                WRITEBGR16
898

    
899
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
900
                        "m" (yalpha1), "m" (uvalpha1)
901
                        : "%eax"
902
                        );
903
                }
904
#else
905
                asm volatile ("\n\t"::: "memory");
906

    
907
                if(dstbpp==32)
908
                {
909
                        int i;
910
                        for(i=0; i<dstw-1; i+=2){
911
                                // vertical linear interpolation && yuv2rgb in a single step:
912
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
913
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
914
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
915
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
916

    
917
                                int Cb= yuvtab_40cf[U];
918
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
919
                                int Cr= yuvtab_3343[V];
920

    
921
                                dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
922
                                dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
923
                                dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
924

    
925
                                dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
926
                                dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
927
                                dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
928
                        }
929
                }
930
                if(dstbpp==24)
931
                {
932
                        int i;
933
                        for(i=0; i<dstw-1; i+=2){
934
                                // vertical linear interpolation && yuv2rgb in a single step:
935
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
936
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
937
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
938
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
939

    
940
                                int Cb= yuvtab_40cf[U];
941
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
942
                                int Cr= yuvtab_3343[V];

                                dest[0]=clip_table[((Y1 + Cb) >>13)];
                                dest[1]=clip_table[((Y1 + Cg) >>13)];
                                dest[2]=clip_table[((Y1 + Cr) >>13)];

                                dest[3]=clip_table[((Y2 + Cb) >>13)];
                                dest[4]=clip_table[((Y2 + Cg) >>13)];
                                dest[5]=clip_table[((Y2 + Cr) >>13)];
                                dest+=6;
                        }
                }
                else if(dstbpp==16)
                {
                        int i;
                        for(i=0; i<dstw-1; i+=2){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                                int Cb= yuvtab_40cf[U];
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                                int Cr= yuvtab_3343[V];

                                ((uint16_t*)dest)[i] =
                                        clip_table16b[(Y1 + Cb) >>13] |
                                        clip_table16g[(Y1 + Cg) >>13] |
                                        clip_table16r[(Y1 + Cr) >>13];

                                ((uint16_t*)dest)[i+1] =
                                        clip_table16b[(Y2 + Cb) >>13] |
                                        clip_table16g[(Y2 + Cg) >>13] |
                                        clip_table16r[(Y2 + Cr) >>13];
                        }
                }
                else if(dstbpp==15)
                {
                        int i;
                        for(i=0; i<dstw-1; i+=2){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                                int Cb= yuvtab_40cf[U];
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                                int Cr= yuvtab_3343[V];

                                ((uint16_t*)dest)[i] =
                                        clip_table15b[(Y1 + Cb) >>13] |
                                        clip_table15g[(Y1 + Cg) >>13] |
                                        clip_table15r[(Y1 + Cr) >>13];

                                ((uint16_t*)dest)[i+1] =
                                        clip_table15b[(Y2 + Cb) >>13] |
                                        clip_table15g[(Y2 + Cg) >>13] |
                                        clip_table15r[(Y2 + Cr) >>13];
                        }
                }
#endif
        } //!FULL_UV_IPOL
}
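
/*
 note on the fixed point arithmetic used by the C paths above: the
 horizontally scaled lines store each sample scaled by 128, and
 yalpha/uvalpha are 12 bit blend weights (yalpha1= yalpha^4095), so
 buf0[i]*yalpha1 + buf1[i]*yalpha needs >>19 to become an 8 bit table index
 again. The yuvtab_* tables hold coefficients scaled by 2^13, and yuvtab_2568
 additionally adds a bias of 256<<13, so (Y + C)>>13 indexes the 768 entry
 clip_table (0 below index 256, i-256 up to 511, 255 above).
 Worked example: Y=235, U=V=128 gives Cb=Cg=Cr=0 and
 (0x2568*(235-16) + (256<<13))>>13 = 511, and clip_table[511] = 255,
 i.e. nominal white saturates to full output.
*/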

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
        int uvalpha1=uvalpha^4095;
#ifdef HAVE_MMX
        int yalpha1=yalpha^4095;
#endif

        if(fullUVIpol || allwaysIpol)
        {
                yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
                return;
        }
        if( yalpha > 2048 ) buf0 = buf1;
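        // yalpha weights buf1 in the 0..4096 range, so taking buf1 once
        // yalpha > 2048 rounds to the nearest source line and the luma can
        // then be used without any vertical interpolation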

#ifdef HAVE_MMX
        if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
        {
                if(dstbpp == 32)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                                WRITEBGR32
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        asm volatile(
                                "movl %4, %%ebx                        \n\t"
                                YSCALEYUV2RGB1
                                WRITEBGR24
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g5Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif
                                WRITEBGR15
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g6Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif

                                WRITEBGR16
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
        }
        else
        {
                if(dstbpp == 32)
                {
                        asm volatile(
                                YSCALEYUV2RGB1b
                                WRITEBGR32
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        asm volatile(
                                "movl %4, %%ebx                        \n\t"
                                YSCALEYUV2RGB1b
                                WRITEBGR24
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(
                                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g5Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif
                                WRITEBGR15
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(
                                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g6Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif

                                WRITEBGR16
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
        }
#else
//FIXME write 2 versions (for even & odd lines)
        asm volatile ("\n\t"::: "memory");
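        // (the empty asm statement with a "memory" clobber presumably serves
        // as a compiler barrier in front of the C fallback below)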

        if(dstbpp==32)
        {
                int i;
                for(i=0; i<dstw-1; i+=2){
                        // chrominance vertical interpolation && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
                        dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
                        dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

                        dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
                        dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
                        dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
                }
        }
        else if(dstbpp==24)
        {
                int i;
                for(i=0; i<dstw-1; i+=2){
                        // chrominance vertical interpolation && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        dest[0]=clip_table[((Y1 + Cb) >>13)];
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
                        dest[2]=clip_table[((Y1 + Cr) >>13)];

                        dest[3]=clip_table[((Y2 + Cb) >>13)];
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
                        dest+=6;
                }
        }
        else if(dstbpp==16)
        {
                int i;
                for(i=0; i<dstw-1; i+=2){
                        // chrominance vertical interpolation && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        ((uint16_t*)dest)[i] =
                                clip_table16b[(Y1 + Cb) >>13] |
                                clip_table16g[(Y1 + Cg) >>13] |
                                clip_table16r[(Y1 + Cr) >>13];

                        ((uint16_t*)dest)[i+1] =
                                clip_table16b[(Y2 + Cb) >>13] |
                                clip_table16g[(Y2 + Cg) >>13] |
                                clip_table16r[(Y2 + Cr) >>13];
                }
        }
        else if(dstbpp==15)
        {
                int i;
                for(i=0; i<dstw-1; i+=2){
                        // chrominance vertical interpolation && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        ((uint16_t*)dest)[i] =
                                clip_table15b[(Y1 + Cb) >>13] |
                                clip_table15g[(Y1 + Cg) >>13] |
                                clip_table15r[(Y1 + Cr) >>13];

                        ((uint16_t*)dest)[i+1] =
                                clip_table15b[(Y2 + Cb) >>13] |
                                clip_table15g[(Y2 + Cg) >>13] |
                                clip_table15r[(Y2 + Cr) >>13];
                }
        }
#endif
}

static inline void hyscale(uint16_t *dst, int dstWidth, uint8_t *src, int srcWidth, int xInc)
{
      // *** horizontal scale Y line to temp buffer
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
                        "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "movq %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t"
                        "paddw %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t"
                        "paddw %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFFFF
                        "movq %%mm2, temp0                \n\t"
                        "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "xorl %%eax, %%eax                \n\t" // i
                        "movl %0, %%esi                        \n\t" // src
                        "movl %1, %%edi                        \n\t" // dst
                        "movl %3, %%edx                        \n\t" // (xInc*4)>>16
                        "xorl %%ecx, %%ecx                \n\t"
                        "xorl %%ebx, %%ebx                \n\t"
                        "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF

#define FUNNY_Y_CODE \
                        PREFETCH" 1024(%%esi)                \n\t"\
                        PREFETCH" 1056(%%esi)                \n\t"\
                        PREFETCH" 1088(%%esi)                \n\t"\
                        "call funnyYCode                \n\t"\
                        "movq temp0, %%mm2                \n\t"\
                        "xorl %%ecx, %%ecx                \n\t"

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                        :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                        "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
                );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth-1; i--) dst[i] = src[srcWidth-1]*128;
        }
        else
        {
#endif
        //no MMX2, just normal asm ...
        asm volatile(
                "xorl %%eax, %%eax                \n\t" // i
                "xorl %%ebx, %%ebx                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                "1:                                \n\t"
                "movzbl  (%0, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%edi, %%eax, 2)        \n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry

                "movzbl (%0, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 2(%%edi, %%eax, 2)        \n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry

                "addl $2, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"

                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
                xpos+=xInc;
        }
#endif
}
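
/*
 The x86 asm paths above implement the same 16.16 fixed point DDA as the
 portable C fallback, roughly:

        unsigned xpos= 0;
        for(i=0; i<dstWidth; i++, xpos+=xInc)
        {
                unsigned xx    = xpos>>16;         // integer source position
                unsigned xalpha= (xpos&0xFFFF)>>9; // 7 bit blend weight
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        }

 The generic asm keeps xpos split across two registers (fraction in %cx,
 integer part in %ebx) so that the addw/adcl pair advances both with a
 single carry chain.
*/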

inline static void hcscale(uint16_t *dst, int dstWidth,
                                uint8_t *src1, uint8_t *src2, int srcWidth, int xInc)
{
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                asm volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
                "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
                "punpcklwd %%mm6, %%mm6                \n\t"
                "punpcklwd %%mm6, %%mm6                \n\t"
                "movq %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t"
                "paddw %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t"
                "paddw %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFFFF
                "movq %%mm2, temp0                \n\t"
                "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
                "punpcklwd %%mm6, %%mm6                \n\t"
                "punpcklwd %%mm6, %%mm6                \n\t"
                "xorl %%eax, %%eax                \n\t" // i
                "movl %0, %%esi                        \n\t" // src1
                "movl %1, %%edi                        \n\t" // dst
                "movl %3, %%edx                        \n\t" // (xInc*4)>>16
                "xorl %%ecx, %%ecx                \n\t"
                "xorl %%ebx, %%ebx                \n\t"
                "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF

#define FUNNYUVCODE \
                        PREFETCH" 1024(%%esi)                \n\t"\
                        PREFETCH" 1056(%%esi)                \n\t"\
                        PREFETCH" 1088(%%esi)                \n\t"\
                        "call funnyUVCode                \n\t"\
                        "movq temp0, %%mm2                \n\t"\
                        "xorl %%ecx, %%ecx                \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
                "xorl %%eax, %%eax                \n\t" // i
                "movl %6, %%esi                        \n\t" // src2
                "movl %1, %%edi                        \n\t" // dst
                "addl $4096, %%edi                \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                  "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
                );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth/2-1; i--)
                {
                        dst[i] = src1[srcWidth/2-1]*128;
                        dst[i+2048] = src2[srcWidth/2-1]*128;
                }
        }
        else
        {
#endif
        asm volatile(
                "xorl %%eax, %%eax                \n\t" // i
                "xorl %%ebx, %%ebx                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                "1:                                \n\t"
                "movl %0, %%esi                        \n\t"
                "movzbl  (%%esi, %%ebx), %%edi        \n\t" //src1[xx]
                "movzbl 1(%%esi, %%ebx), %%esi        \n\t" //src1[xx+1]
                "subl %%edi, %%esi                \n\t" //src1[xx+1] - src1[xx]
                "imull %%ecx, %%esi                \n\t" //(src1[xx+1] - src1[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src1[xx+1]*2*xalpha + src1[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%edi, %%eax, 2)        \n\t"

                "movzbl  (%5, %%ebx), %%edi        \n\t" //src2[xx]
                "movzbl 1(%5, %%ebx), %%esi        \n\t" //src2[xx+1]
                "subl %%edi, %%esi                \n\t" //src2[xx+1] - src2[xx]
                "imull %%ecx, %%esi                \n\t" //(src2[xx+1] - src2[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src2[xx+1]*2*xalpha + src2[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"

                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry
                "addl $1, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"

                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
                "r" (src2)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
                xpos+=xInc;
        }
#endif
}
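
/*
 layout note: U and V share one temp buffer; hcscale writes U to dst[0..] and
 V to dst[2048..] (hence the "addl $4096, %%edi" and "movw ..., 4096(...)"
 above, 2048 uint16_t entries = 4096 bytes), which is why the vertical
 scalers read uvbuf[i] for U and uvbuf[i+2048] for V.
*/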

// *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
// s_xinc = (src_width << 16) / dst_width
// s_yinc = (src_height << 16) / dst_height
void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int y, int h,
                             uint8_t* dstptr[], int dststride, int dstw, int dstbpp,
                             unsigned int s_xinc,unsigned int s_yinc){

// scaling factors:
//static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
//static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;

unsigned int s_xinc2;

static int s_srcypos; // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src)
static int s_ypos;

// last horizontally interpolated lines, used to avoid unnecessary calculations
static int s_last_ypos;
static int s_last_y1pos;

#ifdef HAVE_MMX2
// used to detect a horizontal size change
static int old_dstw= -1;
static int old_s_xinc= -1;
#endif

int srcWidth;
int dstUVw;
int i;

if(((dstw + 7)&(~7)) >= dststride) dstw&= ~7;

srcWidth= (dstw*s_xinc + 0x8000)>>16;
dstUVw= fullUVIpol ? dstw : dstw/2;

#ifdef HAVE_MMX2
canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0;
#endif

// match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
// n-2 is the last chrominance sample available
// FIXME this is not perfect, but no one should notice the difference; the more correct variant
// would be like the vertical one, but that would require some special code for the
// first and last pixel
if(canMMX2BeUsed)        s_xinc+= 20;
else                        s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20;

if(fullUVIpol && !(dstbpp==12))        s_xinc2= s_xinc>>1;
else                                        s_xinc2= s_xinc;

  // force calculation of the horizontal interpolation of the first line
  if(y==0){
//        printf("dstw %d, srcw %d, mmx2 %d\n", dstw, srcWidth, canMMX2BeUsed);
        s_last_ypos=-99;
        s_last_y1pos=-99;
        s_srcypos= s_yinc/2 - 0x8000;
        s_ypos=0;

        // clean the buffers so that no green stuff is drawn if the width is not sane (%8=0)
        for(i=dstw-2; i<dstw+20; i++)
        {
                pix_buf_uv[0][i] = pix_buf_uv[1][i]
                = pix_buf_uv[0][2048+i] = pix_buf_uv[1][2048+i] = 128*128;
                pix_buf_uv[0][i/2] = pix_buf_uv[1][i/2]
                = pix_buf_uv[0][2048+i/2] = pix_buf_uv[1][2048+i/2] = 128*128;
                pix_buf_y[0][i]= pix_buf_y[1][i]= 0;
        }

#ifdef HAVE_MMX2
// can't downscale !!!
        if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed)
        {
                uint8_t *fragment;
                int imm8OfPShufW1;
                int imm8OfPShufW2;
                int fragmentLength;

                int xpos, i;

                old_s_xinc= s_xinc;
                old_dstw= dstw;

                // create an optimized horizontal scaling routine

                //code fragment

                asm volatile(
                        "jmp 9f                                \n\t"
                // Begin
                        "0:                                \n\t"
                        "movq (%%esi), %%mm0                \n\t" //FIXME Alignment
                        "movq %%mm0, %%mm1                \n\t"
                        "psrlq $8, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm1        \n\t"
                        "movq %%mm2, %%mm3                \n\t"
                        "punpcklbw %%mm7, %%mm0        \n\t"
                        "addw %%bx, %%cx                \n\t" //2*xalpha += (4*s_xinc)&0xFFFF
                        "pshufw $0xFF, %%mm1, %%mm1        \n\t"
                        "1:                                \n\t"
                        "adcl %%edx, %%esi                \n\t" //xx+= (4*s_xinc)>>16 + carry
                        "pshufw $0xFF, %%mm0, %%mm0        \n\t"
                        "2:                                \n\t"
                        "psrlw $9, %%mm3                \n\t"
                        "psubw %%mm1, %%mm0                \n\t"
                        "pmullw %%mm3, %%mm0                \n\t"
                        "paddw %%mm6, %%mm2                \n\t" // 2*alpha += xpos&0xFFFF
                        "psllw $7, %%mm1                \n\t"
                        "paddw %%mm1, %%mm0                \n\t"

                        "movq %%mm0, (%%edi, %%eax)        \n\t"

                        "addl $8, %%eax                        \n\t"
                // End
                        "9:                                \n\t"
//                "int $3\n\t"
                        "leal 0b, %0                        \n\t"
                        "leal 1b, %1                        \n\t"
                        "leal 2b, %2                        \n\t"
                        "decl %1                        \n\t"
                        "decl %2                        \n\t"
                        "subl %0, %1                        \n\t"
                        "subl %0, %2                        \n\t"
                        "leal 9b, %3                        \n\t"
                        "subl %0, %3                        \n\t"
                        :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
                         "=r" (fragmentLength)
                );
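
                /*
                 note: the fragment between the "0:" and "9:" labels above is
                 not executed here; the leal/subl block merely extracts its
                 start address, the byte offsets of the two pshufw immediates
                 and its total length. The loops below copy the fragment once
                 per 4 output pixels into funnyYCode/funnyUVCode and patch
                 each copy's pshufw immediates with the per-pixel source
                 offsets, building a scaler specialized for the current
                 scaling factor at runtime.
                */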

                xpos= 0; //s_xinc/2 - 0x8000; // difference between pixel centers

                /* choose xinc so that all 8 parts fit exactly
                   Note: we cannot use just 1 part because it would not fit in the code cache */
//                s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))-10;
//                s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8));
#ifdef ALT_ERROR
//                s_xinc2_diff+= ((0x10000/(dstw/8)));
#endif
//                s_xinc_diff= s_xinc2_diff*2;

//                s_xinc2+= s_xinc2_diff;
//                s_xinc+= s_xinc_diff;

//                old_s_xinc= s_xinc;

                for(i=0; i<dstw/8; i++)
                {
                        int xx=xpos>>16;

                        if((i&3) == 0)
                        {
                                int a=0;
                                int b=((xpos+s_xinc)>>16) - xx;
                                int c=((xpos+s_xinc*2)>>16) - xx;
                                int d=((xpos+s_xinc*3)>>16) - xx;

                                memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);

                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                        a | (b<<2) | (c<<4) | (d<<6);

                                // if we don't need to read 8 bytes then don't :), reduces the chance of
                                // crossing a cache line
                                if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E;

                                funnyYCode[fragmentLength*(i+4)/4]= RET;
                        }
                        xpos+=s_xinc;
                }

                xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chrom samples
                for(i=0; i<dstUVw/8; i++)
                {
                        int xx=xpos>>16;

                        if((i&3) == 0)
                        {
                                int a=0;
                                int b=((xpos+s_xinc2)>>16) - xx;
                                int c=((xpos+s_xinc2*2)>>16) - xx;
                                int d=((xpos+s_xinc2*3)>>16) - xx;

                                memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);

                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                        a | (b<<2) | (c<<4) | (d<<6);

                                // if we don't need to read 8 bytes then don't :), reduces the chance of
                                // crossing a cache line
                                if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;

                                funnyUVCode[fragmentLength*(i+4)/4]= RET;
                        }
                        xpos+=s_xinc2;
                }
//                funnyCode[0]= RET;
        }

#endif // HAVE_MMX2
  } // reset counters

  while(1){
    unsigned char *dest =dstptr[0]+dststride*s_ypos;
    unsigned char *uDest=dstptr[1]+(dststride>>1)*(s_ypos>>1);
    unsigned char *vDest=dstptr[2]+(dststride>>1)*(s_ypos>>1);

    int y0=(s_srcypos + 0xFFFF)>>16;  // first luminance source line number below the dst line
        // points to the dst Pixels center in the source (0 is the center of pixel 0,0 in src)
    int srcuvpos= dstbpp==12 ?        s_srcypos + s_yinc/2 - 0x8000 :
                                    s_srcypos - 0x8000;
    int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
    int yalpha=((s_srcypos-1)&0xFFFF)>>4;
    int uvalpha=((srcuvpos-1)&0x1FFFF)>>5;
    uint16_t *buf0=pix_buf_y[y0&1];                // top line of the interpolated slice
    uint16_t *buf1=pix_buf_y[((y0+1)&1)];        // bottom line of the interpolated slice
    uint16_t *uvbuf0=pix_buf_uv[y1&1];                // top line of the interpolated slice
    uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1];        // bottom line of the interpolated slice
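
    // pix_buf_y / pix_buf_uv each hold just two horizontally scaled lines,
    // used as a 2 entry ring buffer indexed by y0&1 / y1&1; yalpha and
    // uvalpha are the 12 bit fractional positions of the output line
    // between its two source lines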

    if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are duplicates anyway

    if((y0&1) && dstbpp==12) uvalpha=-1; // there is no alpha if there is no line

    s_ypos++; s_srcypos+=s_yinc;

    //only interpolate the src line horizontally if we didn't do it already
        if(s_last_ypos!=y0)
        {
                unsigned char *src;
                // skip if first line has been horiz scaled already
                if(s_last_ypos != y0-1)
                {
                        // check if first line is before any available src lines
                        if(y0-1 < y)        src=srcptr[0]+(0     )*stride[0];
                        else                src=srcptr[0]+(y0-y-1)*stride[0];

                        hyscale(buf0, dstw, src, srcWidth, s_xinc);
                }
                // check if second line is after any available src lines
                if(y0-y >= h)        src=srcptr[0]+(h-1)*stride[0];
                else                src=srcptr[0]+(y0-y)*stride[0];

                // the min() is required to avoid reusing lines which were not available
                s_last_ypos= MIN(y0, y+h-1);
                hyscale(buf1, dstw, src, srcWidth, s_xinc);
        }
//        printf("%d %d %d %d\n", y, y1, s_last_y1pos, h);
      // *** horizontal scale U and V lines to temp buffer
        if(s_last_y1pos!=y1)
        {
                uint8_t *src1, *src2;
                // skip if first line has been horiz scaled already
                if(s_last_y1pos != y1-1)
                {
                        // check if first line is before any available src lines
                        if(y1-y/2-1 < 0)
                        {
                                src1= srcptr[1]+(0)*stride[1];
                                src2= srcptr[2]+(0)*stride[2];
                        }else{
                                src1= srcptr[1]+(y1-y/2-1)*stride[1];
                                src2= srcptr[2]+(y1-y/2-1)*stride[2];
                        }
                        hcscale(uvbuf0, dstUVw, src1, src2, srcWidth, s_xinc2);
                }

                // check if second line is after any available src lines
                if(y1 - y/2 >= h/2)
                {
                        src1= srcptr[1]+(h/2-1)*stride[1];
                        src2= srcptr[2]+(h/2-1)*stride[2];
                }else{
                        src1= srcptr[1]+(y1-y/2)*stride[1];
                        src2= srcptr[2]+(y1-y/2)*stride[2];
                }
                hcscale(uvbuf1, dstUVw, src1, src2, srcWidth, s_xinc2);

                // the min() is required to avoid reusing lines which were not available
                s_last_y1pos= MIN(y1, y/2+h/2-1);
        }
#ifdef HAVE_MMX
        b5Dither= dither8[s_ypos&1];
        g6Dither= dither4[s_ypos&1];
        g5Dither= dither8[s_ypos&1];
        r5Dither= dither8[(s_ypos+1)&1];
#endif

        if(dstbpp==12) //YV12
                yuv2yuv(buf0, buf1, uvbuf0, uvbuf1, dest, uDest, vDest, dstw, yalpha, uvalpha);
        else if(ABS(s_yinc - 0x10000) < 10)
                yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
        else
                yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
  }

#ifdef HAVE_MMX
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
}


void SwScale_Init(){
    // generating tables:
    int i;
    for(i=0;i<256;i++){
        clip_table[i]=0;
        clip_table[i+256]=i;
        clip_table[i+512]=255;
        yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
        yuvtab_3343[i]=0x3343*(i-128);
        yuvtab_0c92[i]=-0x0c92*(i-128);
        yuvtab_1a1e[i]=-0x1a1e*(i-128);
        yuvtab_40cf[i]=0x40cf*(i-128);
    }

    for(i=0; i<768; i++)
    {
        int v= clip_table[i];
        clip_table16b[i]= v>>3;
        clip_table16g[i]= (v<<3)&0x07E0;
        clip_table16r[i]= (v<<8)&0xF800;
        clip_table15b[i]= v>>3;
        clip_table15g[i]= (v<<2)&0x03E0;
        clip_table15r[i]= (v<<7)&0x7C00;
    }
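
    /*
     example of what these tables encode: for a clipped value v, the 16 bit
     tables pre-shift v into the RGB565 fields (v>>3 = bits 0-4 blue,
     (v<<3)&0x07E0 = bits 5-10 green, (v<<8)&0xF800 = bits 11-15 red), so a
     pixel is assembled with three lookups and two ORs, e.g. v=255 gives
     0x001F | 0x07E0 | 0xF800 = 0xFFFF. The 15 bit tables do the same for
     RGB555.
    */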

}