
// Software scaling and colorspace conversion routines for MPlayer

// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by Michael are under GNU GPL

#include <inttypes.h>
#include <string.h>
#include "../config.h"
#include "swscale.h"
#include "../mmx_defs.h"
#undef MOVNTQ
#undef PAVGB

//#undef HAVE_MMX2
//#undef HAVE_MMX
//#undef ARCH_X86
#define DITHER1XBPP
int fullUVIpol=0;
//disables the unscaled height version
int allwaysIpol=0;

#define RET 0xC3 //near return opcode
/*
NOTES

known BUGS with known cause (no bug reports please!, but patches are welcome :) )
horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)

Supported output formats: BGR15 BGR16 BGR24 BGR32
BGR15 & BGR16 MMX versions support dithering
Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
bicubic scaler
dither in C
change the distance of the u & v buffer
*/

#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
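
// Note: pavgb (MMX2) and pavgusb (3DNow!) both compute a rounded byte-wise
// average ((a+b+1)>>1), so PAVGB behaves the same on either path; movntq is a
// non-temporal store that bypasses the cache, with plain movq as the fallback.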


#ifdef HAVE_MMX
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;

static volatile uint64_t __attribute__((aligned(8))) b5Dither;
static volatile uint64_t __attribute__((aligned(8))) g5Dither;
static volatile uint64_t __attribute__((aligned(8))) g6Dither;
static volatile uint64_t __attribute__((aligned(8))) r5Dither;

static uint64_t __attribute__((aligned(8))) dither4[2]={
	0x0103010301030103LL,
	0x0200020002000200LL,};

static uint64_t __attribute__((aligned(8))) dither8[2]={
	0x0602060206020602LL,
	0x0004000400040004LL,};

static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;

static uint64_t __attribute__((aligned(8))) M24A=   0x00FF0000FF0000FFLL;
static uint64_t __attribute__((aligned(8))) M24B=   0xFF0000FF0000FF00LL;
static uint64_t __attribute__((aligned(8))) M24C=   0x0000FF0000FF0000LL;

static uint64_t __attribute__((aligned(8))) temp0;
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif
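
// A sketch of where the color constants come from (my reading, not stated in
// the original comments): the yuv->rgb coefficients are BT.601-style factors
// in Q13 fixed point, applied with pmulhw to samples prescaled by 8:
//   yCoeff  0x2568 =  9576 ~  1.169*8192  (luma gain)
//   vrCoeff 0x3343 = 13123 ~  1.602*8192  (V -> R)
//   ubCoeff 0x40cf = 16591 ~  2.025*8192  (U -> B)
//   vgCoeff 0xE5E2 = -6686 ~ -0.816*8192  (V -> G)
//   ugCoeff 0xF36E = -3218 ~ -0.393*8192  (U -> G)
// w80 = 16*8 and w400 = 128*8 remove the Y/chroma offsets at the same 8x scale.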

// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
#ifdef HAVE_MMX
static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
#else
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];
#endif

// clipping helper table for C implementations:
static unsigned char clip_table[768];

static unsigned short clip_table16b[768];
static unsigned short clip_table16g[768];
static unsigned short clip_table16r[768];
static unsigned short clip_table15b[768];
static unsigned short clip_table15g[768];
static unsigned short clip_table15r[768];

// yuv->rgb conversion tables:
static    int yuvtab_2568[256];
static    int yuvtab_3343[256];
static    int yuvtab_0c92[256];
static    int yuvtab_1a1e[256];
static    int yuvtab_40cf[256];

#ifdef HAVE_MMX2
static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
#endif

static int canMMX2BeUsed=0;
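
// The 768-entry clip tables presumably leave 256 entries of headroom below and
// above the 0..255 range, so out-of-gamut fixed-point results clamp without a
// branch. The yuvtab_* tables are named after their hex coefficients and match
// the MMX constants above (0x0c92 = -ugCoeff, 0x1a1e = -vgCoeff); they are
// filled in elsewhere by the init code.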

#define FULL_YSCALEYUV2RGB \
		"pxor %%mm7, %%mm7 \n\t"\
		"movd %6, %%mm6 \n\t" /*yalpha1*/\
		"punpcklwd %%mm6, %%mm6 \n\t"\
		"punpcklwd %%mm6, %%mm6 \n\t"\
		"movd %7, %%mm5 \n\t" /*uvalpha1*/\
		"punpcklwd %%mm5, %%mm5 \n\t"\
		"punpcklwd %%mm5, %%mm5 \n\t"\
		"xorl %%eax, %%eax \n\t"\
		".align 16 \n\t"\
		"1: \n\t"\
		"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
		"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
		"movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
		"movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
		"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
		"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
		"pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
		"pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
		"psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
		"movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
		"psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
		"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
		"movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
		"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
		"psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
		"psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
		"psubw w400, %%mm3 \n\t" /* 8(U-128)*/\
		"pmulhw yCoeff, %%mm1 \n\t"\
\
\
		"pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
		"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
		"pmulhw ubCoeff, %%mm3 \n\t"\
		"psraw $4, %%mm0 \n\t" /* uvbuf1[eax+2048] >>4*/\
		"pmulhw ugCoeff, %%mm2 \n\t"\
		"paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
		"psubw w400, %%mm0 \n\t" /* (V-128)8*/\
\
\
		"movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
		"pmulhw vrCoeff, %%mm0 \n\t"\
		"pmulhw vgCoeff, %%mm4 \n\t"\
		"paddw %%mm1, %%mm3 \n\t" /* B*/\
		"paddw %%mm1, %%mm0 \n\t" /* R*/\
		"packuswb %%mm3, %%mm3 \n\t"\
\
		"packuswb %%mm0, %%mm0 \n\t"\
		"paddw %%mm4, %%mm2 \n\t"\
		"paddw %%mm2, %%mm1 \n\t" /* G*/\
\
		"packuswb %%mm1, %%mm1 \n\t"

#define YSCALEYUV2RGB \
		"movd %6, %%mm6 \n\t" /*yalpha1*/\
		"punpcklwd %%mm6, %%mm6 \n\t"\
		"punpcklwd %%mm6, %%mm6 \n\t"\
		"movq %%mm6, asm_yalpha1 \n\t"\
		"movd %7, %%mm5 \n\t" /*uvalpha1*/\
		"punpcklwd %%mm5, %%mm5 \n\t"\
		"punpcklwd %%mm5, %%mm5 \n\t"\
		"movq %%mm5, asm_uvalpha1 \n\t"\
		"xorl %%eax, %%eax \n\t"\
		".align 16 \n\t"\
		"1: \n\t"\
		"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
		"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
		"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
		"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
		"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
		"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
		"movq asm_uvalpha1, %%mm0 \n\t"\
		"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
		"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
		"psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
		"psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
		"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
		"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
		"psubw w400, %%mm3 \n\t" /* (U-128)8*/\
		"psubw w400, %%mm4 \n\t" /* (V-128)8*/\
		"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
		"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
		"pmulhw ugCoeff, %%mm3 \n\t"\
		"pmulhw vgCoeff, %%mm4 \n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
		"movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
		"movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
		"movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
		"movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
		"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
		"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
		"pmulhw asm_yalpha1, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
		"pmulhw asm_yalpha1, %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
		"psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
		"psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
		"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
		"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
		"pmulhw ubCoeff, %%mm2 \n\t"\
		"pmulhw vrCoeff, %%mm5 \n\t"\
		"psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
		"psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
		"pmulhw yCoeff, %%mm1 \n\t"\
		"pmulhw yCoeff, %%mm7 \n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
		"paddw %%mm3, %%mm4 \n\t"\
		"movq %%mm2, %%mm0 \n\t"\
		"movq %%mm5, %%mm6 \n\t"\
		"movq %%mm4, %%mm3 \n\t"\
		"punpcklwd %%mm2, %%mm2 \n\t"\
		"punpcklwd %%mm5, %%mm5 \n\t"\
		"punpcklwd %%mm4, %%mm4 \n\t"\
		"paddw %%mm1, %%mm2 \n\t"\
		"paddw %%mm1, %%mm5 \n\t"\
		"paddw %%mm1, %%mm4 \n\t"\
		"punpckhwd %%mm0, %%mm0 \n\t"\
		"punpckhwd %%mm6, %%mm6 \n\t"\
		"punpckhwd %%mm3, %%mm3 \n\t"\
		"paddw %%mm7, %%mm0 \n\t"\
		"paddw %%mm7, %%mm6 \n\t"\
		"paddw %%mm7, %%mm3 \n\t"\
		/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
		"packuswb %%mm0, %%mm2 \n\t"\
		"packuswb %%mm6, %%mm5 \n\t"\
		"packuswb %%mm3, %%mm4 \n\t"\
		"pxor %%mm7, %%mm7 \n\t"

#define YSCALEYUV2RGB1 \
		"xorl %%eax, %%eax \n\t"\
		".align 16 \n\t"\
		"1: \n\t"\
		"movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
		"movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
		"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
		"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
		"psubw w400, %%mm3 \n\t" /* (U-128)8*/\
		"psubw w400, %%mm4 \n\t" /* (V-128)8*/\
		"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
		"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
		"pmulhw ugCoeff, %%mm3 \n\t"\
		"pmulhw vgCoeff, %%mm4 \n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
		"movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
		"movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
		"psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
		"psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
		"pmulhw ubCoeff, %%mm2 \n\t"\
		"pmulhw vrCoeff, %%mm5 \n\t"\
		"psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
		"psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
		"pmulhw yCoeff, %%mm1 \n\t"\
		"pmulhw yCoeff, %%mm7 \n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
		"paddw %%mm3, %%mm4 \n\t"\
		"movq %%mm2, %%mm0 \n\t"\
		"movq %%mm5, %%mm6 \n\t"\
		"movq %%mm4, %%mm3 \n\t"\
		"punpcklwd %%mm2, %%mm2 \n\t"\
		"punpcklwd %%mm5, %%mm5 \n\t"\
		"punpcklwd %%mm4, %%mm4 \n\t"\
		"paddw %%mm1, %%mm2 \n\t"\
		"paddw %%mm1, %%mm5 \n\t"\
		"paddw %%mm1, %%mm4 \n\t"\
		"punpckhwd %%mm0, %%mm0 \n\t"\
		"punpckhwd %%mm6, %%mm6 \n\t"\
		"punpckhwd %%mm3, %%mm3 \n\t"\
		"paddw %%mm7, %%mm0 \n\t"\
		"paddw %%mm7, %%mm6 \n\t"\
		"paddw %%mm7, %%mm3 \n\t"\
		/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
		"packuswb %%mm0, %%mm2 \n\t"\
		"packuswb %%mm6, %%mm5 \n\t"\
		"packuswb %%mm3, %%mm4 \n\t"\
		"pxor %%mm7, %%mm7 \n\t"

// do vertical chrominance interpolation
#define YSCALEYUV2RGB1b \
		"xorl %%eax, %%eax \n\t"\
		".align 16 \n\t"\
		"1: \n\t"\
		"movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
		"movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
		"movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
		"movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
		"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
		"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
		"psrlw $5, %%mm3 \n\t"\
		"psrlw $5, %%mm4 \n\t"\
		"psubw w400, %%mm3 \n\t" /* (U-128)8*/\
		"psubw w400, %%mm4 \n\t" /* (V-128)8*/\
		"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
		"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
		"pmulhw ugCoeff, %%mm3 \n\t"\
		"pmulhw vgCoeff, %%mm4 \n\t"\
	/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
		"movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
		"movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
		"psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
		"psraw $4, %%mm7 \n\t" /* buf0[eax] >>4*/\
		"pmulhw ubCoeff, %%mm2 \n\t"\
		"pmulhw vrCoeff, %%mm5 \n\t"\
		"psubw w80, %%mm1 \n\t" /* 8(Y-16)*/\
		"psubw w80, %%mm7 \n\t" /* 8(Y-16)*/\
		"pmulhw yCoeff, %%mm1 \n\t"\
		"pmulhw yCoeff, %%mm7 \n\t"\
	/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
		"paddw %%mm3, %%mm4 \n\t"\
		"movq %%mm2, %%mm0 \n\t"\
		"movq %%mm5, %%mm6 \n\t"\
		"movq %%mm4, %%mm3 \n\t"\
		"punpcklwd %%mm2, %%mm2 \n\t"\
		"punpcklwd %%mm5, %%mm5 \n\t"\
		"punpcklwd %%mm4, %%mm4 \n\t"\
		"paddw %%mm1, %%mm2 \n\t"\
		"paddw %%mm1, %%mm5 \n\t"\
		"paddw %%mm1, %%mm4 \n\t"\
		"punpckhwd %%mm0, %%mm0 \n\t"\
		"punpckhwd %%mm6, %%mm6 \n\t"\
		"punpckhwd %%mm3, %%mm3 \n\t"\
		"paddw %%mm7, %%mm0 \n\t"\
		"paddw %%mm7, %%mm6 \n\t"\
		"paddw %%mm7, %%mm3 \n\t"\
		/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
		"packuswb %%mm0, %%mm2 \n\t"\
		"packuswb %%mm6, %%mm5 \n\t"\
		"packuswb %%mm3, %%mm4 \n\t"\
		"pxor %%mm7, %%mm7 \n\t"

#define WRITEBGR32 \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
			"movq %%mm2, %%mm1 \n\t" /* B */\
			"movq %%mm5, %%mm6 \n\t" /* R */\
			"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
			"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
			"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
			"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
			"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
			"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
			"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
			"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
			"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
			"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
			MOVNTQ(%%mm0, (%4, %%eax, 4))\
			MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
			MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
			MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
			"addl $8, %%eax \n\t"\
			"cmpl %5, %%eax \n\t"\
			" jb 1b \n\t"

#define WRITEBGR16 \
			"pand bF8, %%mm2 \n\t" /* B */\
			"pand bFC, %%mm4 \n\t" /* G */\
			"pand bF8, %%mm5 \n\t" /* R */\
			"psrlq $3, %%mm2 \n\t"\
\
			"movq %%mm2, %%mm1 \n\t"\
			"movq %%mm4, %%mm3 \n\t"\
\
			"punpcklbw %%mm7, %%mm3 \n\t"\
			"punpcklbw %%mm5, %%mm2 \n\t"\
			"punpckhbw %%mm7, %%mm4 \n\t"\
			"punpckhbw %%mm5, %%mm1 \n\t"\
\
			"psllq $3, %%mm3 \n\t"\
			"psllq $3, %%mm4 \n\t"\
\
			"por %%mm3, %%mm2 \n\t"\
			"por %%mm4, %%mm1 \n\t"\
\
			MOVNTQ(%%mm2, (%4, %%eax, 2))\
			MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
			"addl $8, %%eax \n\t"\
			"cmpl %5, %%eax \n\t"\
			" jb 1b \n\t"

#define WRITEBGR15 \
			"pand bF8, %%mm2 \n\t" /* B */\
			"pand bF8, %%mm4 \n\t" /* G */\
			"pand bF8, %%mm5 \n\t" /* R */\
			"psrlq $3, %%mm2 \n\t"\
			"psrlq $1, %%mm5 \n\t"\
\
			"movq %%mm2, %%mm1 \n\t"\
			"movq %%mm4, %%mm3 \n\t"\
\
			"punpcklbw %%mm7, %%mm3 \n\t"\
			"punpcklbw %%mm5, %%mm2 \n\t"\
			"punpckhbw %%mm7, %%mm4 \n\t"\
			"punpckhbw %%mm5, %%mm1 \n\t"\
\
			"psllq $2, %%mm3 \n\t"\
			"psllq $2, %%mm4 \n\t"\
\
			"por %%mm3, %%mm2 \n\t"\
			"por %%mm4, %%mm1 \n\t"\
\
			MOVNTQ(%%mm2, (%4, %%eax, 2))\
			MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
			"addl $8, %%eax \n\t"\
			"cmpl %5, %%eax \n\t"\
			" jb 1b \n\t"

#define WRITEBGR24OLD \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
			"movq %%mm2, %%mm1 \n\t" /* B */\
			"movq %%mm5, %%mm6 \n\t" /* R */\
			"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
			"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
			"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
			"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
			"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
			"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
			"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
			"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
			"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
			"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
			"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
			"psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
			"pand bm00000111, %%mm4 \n\t" /* 00000RGB 0 */\
			"pand bm11111000, %%mm0 \n\t" /* 00RGB000 0.5 */\
			"por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
			"movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
			"psllq $48, %%mm2 \n\t" /* GB000000 1 */\
			"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
			"movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
			"psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
			"psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
			"por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
			"pand bm00001111, %%mm2 \n\t" /* 0000RGBR 1 */\
			"movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
			"psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
			"pand bm00000111, %%mm4 \n\t" /* 00000RGB 2 */\
			"pand bm11111000, %%mm1 \n\t" /* 00RGB000 2.5 */\
			"por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
			"movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
			"psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
			"por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
			"psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
			"movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
			"psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
			"pand bm00000111, %%mm5 \n\t" /* 00000RGB 3 */\
			"pand bm11111000, %%mm3 \n\t" /* 00RGB000 3.5 */\
			"por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
			"psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
			"por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
			MOVNTQ(%%mm0, (%%ebx))\
			MOVNTQ(%%mm2, 8(%%ebx))\
			MOVNTQ(%%mm3, 16(%%ebx))\
			"addl $24, %%ebx \n\t"\
\
			"addl $8, %%eax \n\t"\
			"cmpl %5, %%eax \n\t"\
			" jb 1b \n\t"

#define WRITEBGR24MMX \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
			"movq %%mm2, %%mm1 \n\t" /* B */\
			"movq %%mm5, %%mm6 \n\t" /* R */\
			"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
			"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
			"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
			"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
			"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
			"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
			"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
			"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
			"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
			"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
			"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
			"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
			"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
			"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
			"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
			"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
			"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
			"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
			"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
			"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
			"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
			"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
			"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
			"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
			"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
			"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
			MOVNTQ(%%mm0, (%%ebx))\
\
			"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
			"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
			"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
			"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
			MOVNTQ(%%mm6, 8(%%ebx))\
\
			"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
			"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
			"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
			MOVNTQ(%%mm5, 16(%%ebx))\
\
			"addl $24, %%ebx \n\t"\
\
			"addl $8, %%eax \n\t"\
			"cmpl %5, %%eax \n\t"\
			" jb 1b \n\t"

#define WRITEBGR24MMX2 \
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
			"movq M24A, %%mm0 \n\t"\
			"movq M24C, %%mm7 \n\t"\
			"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
			"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
			"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
			"pand %%mm0, %%mm1 \n\t" /*    B2        B1       B0 */\
			"pand %%mm0, %%mm3 \n\t" /*    G2        G1       G0 */\
			"pand %%mm7, %%mm6 \n\t" /*       R1        R0       */\
\
			"psllq $8, %%mm3 \n\t" /* G2        G1       G0    */\
			"por %%mm1, %%mm6 \n\t"\
			"por %%mm3, %%mm6 \n\t"\
			MOVNTQ(%%mm6, (%%ebx))\
\
			"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
			"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
			"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
			"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
			"pand M24B, %%mm1 \n\t" /* B5       B4        B3    */\
			"pand %%mm7, %%mm3 \n\t" /*       G4        G3       */\
			"pand %%mm0, %%mm6 \n\t" /*    R4        R3       R2 */\
\
			"por %%mm1, %%mm3 \n\t" /* B5    G4 B4     G3 B3    */\
			"por %%mm3, %%mm6 \n\t"\
			MOVNTQ(%%mm6, 8(%%ebx))\
\
			"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
			"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
			"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
			"pand %%mm7, %%mm1 \n\t" /*       B7        B6       */\
			"pand %%mm0, %%mm3 \n\t" /*    G7        G6       G5 */\
			"pand M24B, %%mm6 \n\t" /* R7       R6        R5    */\
\
			"por %%mm1, %%mm3 \n\t"\
			"por %%mm3, %%mm6 \n\t"\
			MOVNTQ(%%mm6, 16(%%ebx))\
\
			"addl $24, %%ebx \n\t"\
\
			"addl $8, %%eax \n\t"\
			"cmpl %5, %%eax \n\t"\
			" jb 1b \n\t"

#ifdef HAVE_MMX2
#define WRITEBGR24 WRITEBGR24MMX2
#else
#define WRITEBGR24 WRITEBGR24MMX
#endif
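
// The MMX2 24-bit writer relies on pshufw, which plain MMX lacks, hence the
// compile-time dispatch above.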

#ifdef HAVE_MMX
void in_asm_used_var_warning_killer()
{
 int i= yCoeff+vrCoeff+ubCoeff+vgCoeff+ugCoeff+bF8+bFC+w400+w80+w10+
 bm00001111+bm00000111+bm11111000+b16Mask+g16Mask+r16Mask+b15Mask+g15Mask+r15Mask+temp0+asm_yalpha1+ asm_uvalpha1+
 M24A+M24B+M24C;
 if(i) i=0;
}
#endif
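
// (The function above only exists to reference the constants from C: they are
// otherwise used solely inside inline asm strings, where the compiler cannot
// see them and might warn about or discard the "unused" statics.)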

static inline void yuv2yuv(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                           uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstw, int yalpha, int uvalpha)
{
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;
	int i;

	asm volatile ("\n\t"::: "memory");

	for(i=0;i<dstw;i++)
	{
		((uint8_t*)dest)[i] = (buf0[i]*yalpha1+buf1[i]*yalpha)>>19;
	}

	if(uvalpha != -1)
	{
		for(i=0; i<(dstw>>1); i++)
		{
			((uint8_t*)uDest)[i] = (uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19;
			((uint8_t*)vDest)[i] = (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;
		}
	}
}
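
/* Fixed-point conventions in the vertical scalers (deduced from the code):
 * yalpha/uvalpha are 12-bit blend weights, and yalpha1 = yalpha^4095 is the
 * complementary weight (4095-yalpha). The line buffers hold samples scaled by
 * 128 (sample<<7), so sample*weight needs >>19 to come back to 8 bits, e.g.:
 *     dest[i] = (buf0[i]*yalpha1 + buf1[i]*yalpha) >> 19;
 */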

/**
 * vertical scale YV12 to RGB
 */
static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
	int yalpha1=yalpha^4095;
	int uvalpha1=uvalpha^4095;

	if(fullUVIpol)
	{

#ifdef HAVE_MMX
		if(dstbpp == 32)
		{
			asm volatile(


FULL_YSCALEYUV2RGB
			"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

			"movq %%mm3, %%mm1 \n\t"
			"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

			MOVNTQ(%%mm3, (%4, %%eax, 4))
			MOVNTQ(%%mm1, 8(%4, %%eax, 4))

			"addl $4, %%eax \n\t"
			"cmpl %5, %%eax \n\t"
			" jb 1b \n\t"


			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==24)
		{
			asm volatile(

FULL_YSCALEYUV2RGB

						// lsb ... msb
			"punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
			"punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

			"movq %%mm3, %%mm1 \n\t"
			"punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
			"punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

			"movq %%mm3, %%mm2 \n\t" // BGR0BGR0
			"psrlq $8, %%mm3 \n\t" // GR0BGR00
			"pand bm00000111, %%mm2 \n\t" // BGR00000
			"pand bm11111000, %%mm3 \n\t" // 000BGR00
			"por %%mm2, %%mm3 \n\t" // BGRBGR00
			"movq %%mm1, %%mm2 \n\t"
			"psllq $48, %%mm1 \n\t" // 000000BG
			"por %%mm1, %%mm3 \n\t" // BGRBGRBG

			"movq %%mm2, %%mm1 \n\t" // BGR0BGR0
			"psrld $16, %%mm2 \n\t" // R000R000
			"psrlq $24, %%mm1 \n\t" // 0BGR0000
			"por %%mm2, %%mm1 \n\t" // RBGRR000

			"movl %4, %%ebx \n\t"
			"addl %%eax, %%ebx \n\t"

#ifdef HAVE_MMX2
			//FIXME Alignment
			"movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
			"movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
			"movd %%mm3, (%%ebx, %%eax, 2) \n\t"
			"psrlq $32, %%mm3 \n\t"
			"movd %%mm3, 4(%%ebx, %%eax, 2) \n\t"
			"movd %%mm1, 8(%%ebx, %%eax, 2) \n\t"
#endif
			"addl $4, %%eax \n\t"
			"cmpl %5, %%eax \n\t"
			" jb 1b \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
		}
		else if(dstbpp==15)
		{
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			"paddusb g5Dither, %%mm1 \n\t"
			"paddusb r5Dither, %%mm0 \n\t"
			"paddusb b5Dither, %%mm3 \n\t"
#endif
			"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

			"psrlw $3, %%mm3 \n\t"
			"psllw $2, %%mm1 \n\t"
			"psllw $7, %%mm0 \n\t"
			"pand g15Mask, %%mm1 \n\t"
			"pand r15Mask, %%mm0 \n\t"

			"por %%mm3, %%mm1 \n\t"
			"por %%mm1, %%mm0 \n\t"

			MOVNTQ(%%mm0, (%4, %%eax, 2))

			"addl $4, %%eax \n\t"
			"cmpl %5, %%eax \n\t"
			" jb 1b \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstbpp==16)
		{
			asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
			"paddusb g6Dither, %%mm1 \n\t"
			"paddusb r5Dither, %%mm0 \n\t"
			"paddusb b5Dither, %%mm3 \n\t"
#endif
			"punpcklbw %%mm7, %%mm1 \n\t" // 0G0G0G0G
			"punpcklbw %%mm7, %%mm3 \n\t" // 0B0B0B0B
			"punpcklbw %%mm7, %%mm0 \n\t" // 0R0R0R0R

			"psrlw $3, %%mm3 \n\t"
			"psllw $3, %%mm1 \n\t"
			"psllw $8, %%mm0 \n\t"
			"pand g16Mask, %%mm1 \n\t"
			"pand r16Mask, %%mm0 \n\t"

			"por %%mm3, %%mm1 \n\t"
			"por %%mm1, %%mm0 \n\t"

			MOVNTQ(%%mm0, (%4, %%eax, 2))

			"addl $4, %%eax \n\t"
			"cmpl %5, %%eax \n\t"
			" jb 1b \n\t"

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
#else
		asm volatile ("\n\t"::: "memory");

		if(dstbpp==32 || dstbpp==24)
		{
			int i;
			for(i=0;i<dstw;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+=dstbpp>>3;
			}
		}
		else if(dstbpp==16)
		{
			int i;
			for(i=0;i<dstw;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table16r[(Y + yuvtab_3343[V]) >>13];
			}
		}
		else if(dstbpp==15)
		{
			int i;
			for(i=0;i<dstw;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table15r[(Y + yuvtab_3343[V]) >>13];
			}
		}
#endif
	}//FULL_UV_IPOL
846
        else
847
        {
848
#ifdef HAVE_MMX
849
                if(dstbpp == 32)
850
                {
851
                        asm volatile(
852
                                YSCALEYUV2RGB
853
                                WRITEBGR32
854

    
855
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
856
                        "m" (yalpha1), "m" (uvalpha1)
857
                        : "%eax"
858
                        );
859
                }
860
                else if(dstbpp==24)
861
                {
862
                        asm volatile(
863
                                "movl %4, %%ebx                        \n\t"
864
                                YSCALEYUV2RGB
865
                                WRITEBGR24
866

    
867
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
868
                        "m" (yalpha1), "m" (uvalpha1)
869
                        : "%eax", "%ebx"
870
                        );
871
                }
872
                else if(dstbpp==15)
873
                {
874
                        asm volatile(
875
                                YSCALEYUV2RGB
876
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
877
#ifdef DITHER1XBPP
878
                                "paddusb b5Dither, %%mm2        \n\t"
879
                                "paddusb g5Dither, %%mm4        \n\t"
880
                                "paddusb r5Dither, %%mm5        \n\t"
881
#endif
882

    
883
                                WRITEBGR15
884

    
885
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
886
                        "m" (yalpha1), "m" (uvalpha1)
887
                        : "%eax"
888
                        );
889
                }
890
                else if(dstbpp==16)
891
                {
892
                        asm volatile(
893
                                YSCALEYUV2RGB
894
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
895
#ifdef DITHER1XBPP
896
                                "paddusb b5Dither, %%mm2        \n\t"
897
                                "paddusb g6Dither, %%mm4        \n\t"
898
                                "paddusb r5Dither, %%mm5        \n\t"
899
#endif
900

    
901
                                WRITEBGR16
902

    
903
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
904
                        "m" (yalpha1), "m" (uvalpha1)
905
                        : "%eax"
906
                        );
907
                }
908
#else
909
                asm volatile ("\n\t"::: "memory");
910

    
911
                if(dstbpp==32)
912
                {
913
                        int i;
914
                        for(i=0; i<dstw-1; i+=2){
915
                                // vertical linear interpolation && yuv2rgb in a single step:
916
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
917
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
918
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
919
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
920

    
921
                                int Cb= yuvtab_40cf[U];
922
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
923
                                int Cr= yuvtab_3343[V];
924

    
925
                                dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
926
                                dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
927
                                dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
928

    
929
                                dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
930
                                dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
931
                                dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
932
                        }
933
                }
934
                if(dstbpp==24)
935
                {
936
                        int i;
937
                        for(i=0; i<dstw-1; i+=2){
938
                                // vertical linear interpolation && yuv2rgb in a single step:
939
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
940
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
941
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
942
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                                int Cb= yuvtab_40cf[U];
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                                int Cr= yuvtab_3343[V];

                                dest[0]=clip_table[((Y1 + Cb) >>13)];
                                dest[1]=clip_table[((Y1 + Cg) >>13)];
                                dest[2]=clip_table[((Y1 + Cr) >>13)];

                                dest[3]=clip_table[((Y2 + Cb) >>13)];
                                dest[4]=clip_table[((Y2 + Cg) >>13)];
                                dest[5]=clip_table[((Y2 + Cr) >>13)];
                                dest+=6;
                        }
                }
                else if(dstbpp==16)
                {
                        int i;
                        for(i=0; i<dstw-1; i+=2){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                                int Cb= yuvtab_40cf[U];
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                                int Cr= yuvtab_3343[V];

                                ((uint16_t*)dest)[i] =
                                        clip_table16b[(Y1 + Cb) >>13] |
                                        clip_table16g[(Y1 + Cg) >>13] |
                                        clip_table16r[(Y1 + Cr) >>13];

                                ((uint16_t*)dest)[i+1] =
                                        clip_table16b[(Y2 + Cb) >>13] |
                                        clip_table16g[(Y2 + Cg) >>13] |
                                        clip_table16r[(Y2 + Cr) >>13];
                        }
                }
                else if(dstbpp==15)
                {
                        int i;
                        for(i=0; i<dstw-1; i+=2){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                                int Cb= yuvtab_40cf[U];
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                                int Cr= yuvtab_3343[V];

                                ((uint16_t*)dest)[i] =
                                        clip_table15b[(Y1 + Cb) >>13] |
                                        clip_table15g[(Y1 + Cg) >>13] |
                                        clip_table15r[(Y1 + Cr) >>13];

                                ((uint16_t*)dest)[i+1] =
                                        clip_table15b[(Y2 + Cb) >>13] |
                                        clip_table15g[(Y2 + Cg) >>13] |
                                        clip_table15r[(Y2 + Cr) >>13];
                        }
                }
#endif
        } //!FULL_UV_IPOL
}
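
/* Illustrative note (not part of the original code): in the routines above,
   buf0/buf1 and uvbuf0/uvbuf1 hold 8-bit samples scaled by 128 (7 fractional
   bits) and yalpha/uvalpha are 12-bit vertical blend weights (0..4095), so
   each (sample<<7)*weight product carries 19 fractional bits and >>19
   recovers an 8-bit table index.  Worked example with assumed values:
   a luma of 235 is stored as 235*128=30080; blending two equal lines with
   weights 3071 and 1024 gives (30080*3071 + 30080*1024)>>19 = 234, and with
   zero chroma contribution (yuvtab_2568[234])>>13 = 510 indexes clip_table
   to 254, i.e. roughly 1.164*(234-16). */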

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
        int uvalpha1=uvalpha^4095;
#ifdef HAVE_MMX
        int yalpha1=yalpha^4095;
#endif

        if(fullUVIpol || allwaysIpol)
        {
                yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
                return;
        }
        if( yalpha > 2048 ) buf0 = buf1;

#ifdef HAVE_MMX
        if( uvalpha < 2048 ) // note this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster
        {
                if(dstbpp == 32)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                                WRITEBGR32
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        asm volatile(
                                "movl %4, %%ebx                        \n\t"
                                YSCALEYUV2RGB1
                                WRITEBGR24
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g5Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif
                                WRITEBGR15
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g6Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif

                                WRITEBGR16
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
        }
        else
        {
                if(dstbpp == 32)
                {
                        asm volatile(
                                YSCALEYUV2RGB1b
                                WRITEBGR32
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        asm volatile(
                                "movl %4, %%ebx                        \n\t"
                                YSCALEYUV2RGB1b
                                WRITEBGR24
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(
                                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g5Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif
                                WRITEBGR15
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(
                                YSCALEYUV2RGB1b
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b5Dither, %%mm2        \n\t"
                                "paddusb g6Dither, %%mm4        \n\t"
                                "paddusb r5Dither, %%mm5        \n\t"
#endif

                                WRITEBGR16
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
        }
#else
//FIXME write 2 versions (for even & odd lines)
        asm volatile ("\n\t"::: "memory");

        if(dstbpp==32)
        {
                int i;
                for(i=0; i<dstw-1; i+=2){
                        // chroma vertical linear interpolation && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
                        dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
                        dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

                        dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
                        dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
                        dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
                }
        }
        else if(dstbpp==24)
        {
                int i;
                for(i=0; i<dstw-1; i+=2){
                        // chroma vertical linear interpolation && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        dest[0]=clip_table[((Y1 + Cb) >>13)];
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
                        dest[2]=clip_table[((Y1 + Cr) >>13)];

                        dest[3]=clip_table[((Y2 + Cb) >>13)];
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
                        dest+=6;
                }
        }
        else if(dstbpp==16)
        {
                int i;
                for(i=0; i<dstw-1; i+=2){
                        // chroma vertical linear interpolation && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        ((uint16_t*)dest)[i] =
                                clip_table16b[(Y1 + Cb) >>13] |
                                clip_table16g[(Y1 + Cg) >>13] |
                                clip_table16r[(Y1 + Cr) >>13];

                        ((uint16_t*)dest)[i+1] =
                                clip_table16b[(Y2 + Cb) >>13] |
                                clip_table16g[(Y2 + Cg) >>13] |
                                clip_table16r[(Y2 + Cr) >>13];
                }
        }
        else if(dstbpp==15)
        {
                int i;
                for(i=0; i<dstw-1; i+=2){
                        // chroma vertical linear interpolation && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        ((uint16_t*)dest)[i] =
                                clip_table15b[(Y1 + Cb) >>13] |
                                clip_table15g[(Y1 + Cg) >>13] |
                                clip_table15r[(Y1 + Cr) >>13];

                        ((uint16_t*)dest)[i+1] =
                                clip_table15b[(Y2 + Cb) >>13] |
                                clip_table15g[(Y2 + Cg) >>13] |
                                clip_table15r[(Y2 + Cr) >>13];
                }
        }
#endif
}
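
/* Sketch of the uvalpha<2048 fast path above (comment only, assumed form;
   uvbuf stands for whichever chroma line is nearer): per pixel pair the
   chroma is taken from a single line, roughly
        U = uvbuf[i>>1] >> 7;
        V = uvbuf[(i>>1)+2048] >> 7;
   instead of blending uvbuf0 and uvbuf1, trading up to half a source line of
   chroma shift for one multiply-add less per component; the YSCALEYUV2RGB1b
   branch averages the two chroma lines instead. */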


static inline void hyscale(uint16_t *dst, int dstWidth, uint8_t *src, int srcWidth, int xInc)
{
      // *** horizontal scale Y line to temp buffer
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
                        "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "movq %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t"
                        "paddw %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t"
                        "paddw %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFFFF
                        "movq %%mm2, temp0                \n\t"
                        "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "xorl %%eax, %%eax                \n\t" // i
                        "movl %0, %%esi                        \n\t" // src
                        "movl %1, %%edi                        \n\t" // buf1
                        "movl %3, %%edx                        \n\t" // (xInc*4)>>16
                        "xorl %%ecx, %%ecx                \n\t"
                        "xorl %%ebx, %%ebx                \n\t"
                        "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF

#define FUNNY_Y_CODE \
                        PREFETCH" 1024(%%esi)                \n\t"\
                        PREFETCH" 1056(%%esi)                \n\t"\
                        PREFETCH" 1088(%%esi)                \n\t"\
                        "call funnyYCode                \n\t"\
                        "movq temp0, %%mm2                \n\t"\
                        "xorl %%ecx, %%ecx                \n\t"

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                        :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                        "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
                );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth-1; i--) dst[i] = src[srcWidth-1]*128;
        }
        else
        {
#endif
        //NO MMX just normal asm ...
        asm volatile(
                "xorl %%eax, %%eax                \n\t" // i
                "xorl %%ebx, %%ebx                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                ".align 16                        \n\t"
                "1:                                \n\t"
                "movzbl  (%0, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%edi, %%eax, 2)        \n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry

                "movzbl (%0, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 2(%%edi, %%eax, 2)        \n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry

                "addl $2, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"

                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
                xpos+=xInc;
        }
#endif
}
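
/* Worked example for the 16.16 stepping above (assumed sizes, not from the
   original): with srcWidth=320 and dstWidth=640, xInc=(320<<16)/640=0x8000.
   For dst[3], xpos=3*xInc=0x18000, so xx=1 and xalpha=(xpos&0xFFFF)>>9=64,
   giving dst[3]=(src[1]<<7)+(src[2]-src[1])*64: the midpoint of src[1] and
   src[2], kept with 7 fractional bits for the vertical pass. */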

inline static void hcscale(uint16_t *dst, int dstWidth,
                                uint8_t *src1, uint8_t *src2, int srcWidth, int xInc)
{
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                asm volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
                "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
                "punpcklwd %%mm6, %%mm6                \n\t"
                "punpcklwd %%mm6, %%mm6                \n\t"
                "movq %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t"
                "paddw %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t"
                "paddw %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFFFF
                "movq %%mm2, temp0                \n\t"
                "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
                "punpcklwd %%mm6, %%mm6                \n\t"
                "punpcklwd %%mm6, %%mm6                \n\t"
                "xorl %%eax, %%eax                \n\t" // i
                "movl %0, %%esi                        \n\t" // src1
                "movl %1, %%edi                        \n\t" // buf1
                "movl %3, %%edx                        \n\t" // (xInc*4)>>16
                "xorl %%ecx, %%ecx                \n\t"
                "xorl %%ebx, %%ebx                \n\t"
                "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF

#define FUNNYUVCODE \
                        PREFETCH" 1024(%%esi)                \n\t"\
                        PREFETCH" 1056(%%esi)                \n\t"\
                        PREFETCH" 1088(%%esi)                \n\t"\
                        "call funnyUVCode                \n\t"\
                        "movq temp0, %%mm2                \n\t"\
                        "xorl %%ecx, %%ecx                \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
                "xorl %%eax, %%eax                \n\t" // i
                "movl %6, %%esi                        \n\t" // src2
                "movl %1, %%edi                        \n\t" // buf1
                "addl $4096, %%edi                \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                  "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
        );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth/2-1; i--)
                {
                        dst[i] = src1[srcWidth/2-1]*128;
                        dst[i+2048] = src2[srcWidth/2-1]*128;
                }
        }
        else
        {
#endif
        asm volatile(
                "xorl %%eax, %%eax                \n\t" // i
                "xorl %%ebx, %%ebx                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                ".align 16                        \n\t"
                "1:                                \n\t"
                "movl %0, %%esi                        \n\t"
                "movzbl  (%%esi, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%%esi, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%edi, %%eax, 2)        \n\t"

                "movzbl  (%5, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%5, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"

                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry
                "addl $1, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"

                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
                "r" (src2)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
                xpos+=xInc;
        }
#endif
}
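
/* Layout note (illustrative): dst holds the scaled U line in words 0..2047
   and the V line starting at word 2048, which is why the asm above stores V
   at byte offset 4096 while the C path writes dst[i+2048].  The C path's
   weights (xalpha^127) and xalpha sum to 127 rather than 128, a ~0.8%
   approximation the author found faster than the exact form kept in the
   commented-out "slower" variant. */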


// *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
// s_xinc = (src_width << 16) / dst_width
// s_yinc = (src_height << 16) / dst_height
void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int y, int h,
                             uint8_t* dstptr[], int dststride, int dstw, int dstbpp,
                             unsigned int s_xinc,unsigned int s_yinc){

// scaling factors:
//static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
//static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;

unsigned int s_xinc2;

static int s_srcypos; // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
static int s_ypos;

// last horizontally interpolated lines, used to avoid unnecessary calculations
static int s_last_ypos;
static int s_last_y1pos;

#ifdef HAVE_MMX2
// used to detect a horizontal size change
static int old_dstw= -1;
static int old_s_xinc= -1;
#endif

int srcWidth;
int dstUVw;
int i;

if(((dstw + 7)&(~7)) >= dststride) dstw&= ~7;

srcWidth= (dstw*s_xinc + 0x8000)>>16;
dstUVw= fullUVIpol ? dstw : dstw/2;

#ifdef HAVE_MMX2
canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0;
#endif

// match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
// n-2 is the last chrominance sample available
// FIXME this is not perfect, but no one should notice the difference; the more correct variant
// would be like the vertical one, but that would require some special code for the
// first and last pixel
if(canMMX2BeUsed)         s_xinc+= 20;
else                        s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20;

if(fullUVIpol && !(dstbpp==12))         s_xinc2= s_xinc>>1;
else                                        s_xinc2= s_xinc;
  // force calculation of the horizontal interpolation of the first line

  if(y==0){
//        printf("dstw %d, srcw %d, mmx2 %d\n", dstw, srcWidth, canMMX2BeUsed);
        s_last_ypos=-99;
        s_last_y1pos=-99;
        s_srcypos= s_yinc/2 - 0x8000;
        s_ypos=0;

        // clean the buffers so that no green stuff is drawn if the width is not sane (%8=0)
        for(i=dstw-2; i<dstw+20; i++)
        {
                pix_buf_uv[0][i] = pix_buf_uv[1][i]
                = pix_buf_uv[0][2048+i] = pix_buf_uv[1][2048+i] = 128*128;
                pix_buf_uv[0][i/2] = pix_buf_uv[1][i/2]
                = pix_buf_uv[0][2048+i/2] = pix_buf_uv[1][2048+i/2] = 128*128;
                pix_buf_y[0][i]= pix_buf_y[1][i]= 0;
        }

#ifdef HAVE_MMX2
// can't downscale !!!
        if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed)
        {
                uint8_t *fragment;
                int imm8OfPShufW1;
                int imm8OfPShufW2;
                int fragmentLength;

                int xpos, i;

                old_s_xinc= s_xinc;
                old_dstw= dstw;

                // create an optimized horizontal scaling routine

                //code fragment

                asm volatile(
                        "jmp 9f                                \n\t"
                // Begin
                        "0:                                \n\t"
                        "movq (%%esi), %%mm0                \n\t" //FIXME Alignment
                        "movq %%mm0, %%mm1                \n\t"
                        "psrlq $8, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm1        \n\t"
                        "movq %%mm2, %%mm3                \n\t"
                        "punpcklbw %%mm7, %%mm0        \n\t"
                        "addw %%bx, %%cx                \n\t" //2*xalpha += (4*s_xinc)&0xFFFF
                        "pshufw $0xFF, %%mm1, %%mm1        \n\t"
                        "1:                                \n\t"
                        "adcl %%edx, %%esi                \n\t" //xx+= (4*s_xinc)>>16 + carry
                        "pshufw $0xFF, %%mm0, %%mm0        \n\t"
                        "2:                                \n\t"
                        "psrlw $9, %%mm3                \n\t"
                        "psubw %%mm1, %%mm0                \n\t"
                        "pmullw %%mm3, %%mm0                \n\t"
                        "paddw %%mm6, %%mm2                \n\t" // 2*alpha += xpos&0xFFFF
                        "psllw $7, %%mm1                \n\t"
                        "paddw %%mm1, %%mm0                \n\t"

                        "movq %%mm0, (%%edi, %%eax)        \n\t"

                        "addl $8, %%eax                        \n\t"
                // End
                        "9:                                \n\t"
//                "int $3\n\t"
                        "leal 0b, %0                        \n\t"
                        "leal 1b, %1                        \n\t"
                        "leal 2b, %2                        \n\t"
                        "decl %1                        \n\t"
                        "decl %2                        \n\t"
                        "subl %0, %1                        \n\t"
                        "subl %0, %2                        \n\t"
                        "leal 9b, %3                        \n\t"
                        "subl %0, %3                        \n\t"
                        :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
                         "=r" (fragmentLength)
                );

                xpos= 0; //s_xinc/2 - 0x8000; // difference between pixel centers

                /* choose xinc so that all 8 parts fit exactly
                   Note: we cannot use just 1 part because it would not fit in the code cache */
//                s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))-10;
//                s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8));
#ifdef ALT_ERROR
//                s_xinc2_diff+= ((0x10000/(dstw/8)));
#endif
//                s_xinc_diff= s_xinc2_diff*2;

//                s_xinc2+= s_xinc2_diff;
//                s_xinc+= s_xinc_diff;

//                old_s_xinc= s_xinc;

                for(i=0; i<dstw/8; i++)
                {
                        int xx=xpos>>16;

                        if((i&3) == 0)
                        {
                                int a=0;
                                int b=((xpos+s_xinc)>>16) - xx;
                                int c=((xpos+s_xinc*2)>>16) - xx;
                                int d=((xpos+s_xinc*3)>>16) - xx;

                                memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);

                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                        a | (b<<2) | (c<<4) | (d<<6);

                                // if we don't need to read 8 bytes then don't :), reduces the chance of
                                // crossing a cache line
                                if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E;

                                funnyYCode[fragmentLength*(i+4)/4]= RET;
                        }
                        xpos+=s_xinc;
                }
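
                /* Worked example for the immediates above (assumed
                   s_xinc=0x8000, a clean 2x upscale): starting at xpos=0 the
                   per-pixel steps are a=0, b=0, c=1, d=1, so
                   a | (b<<2) | (c<<4) | (d<<6) = 0x50 is the pshufw immediate
                   selecting which of the loaded source samples feeds each of
                   the 4 output pixels.  The 0x6E patch in these loops turns
                   the fragment's leading movq (0f 6f) into a movd (0f 6e) so
                   that only 4 source bytes are read when d<3. */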

                xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chroma samples
                for(i=0; i<dstUVw/8; i++)
                {
                        int xx=xpos>>16;

                        if((i&3) == 0)
                        {
                                int a=0;
                                int b=((xpos+s_xinc2)>>16) - xx;
                                int c=((xpos+s_xinc2*2)>>16) - xx;
                                int d=((xpos+s_xinc2*3)>>16) - xx;

                                memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);

                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                        a | (b<<2) | (c<<4) | (d<<6);

                                // if we don't need to read 8 bytes then don't :), reduces the chance of
                                // crossing a cache line
                                if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;

                                funnyUVCode[fragmentLength*(i+4)/4]= RET;
                        }
                        xpos+=s_xinc2;
                }
//                funnyCode[0]= RET;
        }

#endif // HAVE_MMX2
  } // reset counters

  while(1){
    unsigned char *dest =dstptr[0]+dststride*s_ypos;
    unsigned char *uDest=dstptr[1]+(dststride>>1)*(s_ypos>>1);
    unsigned char *vDest=dstptr[2]+(dststride>>1)*(s_ypos>>1);

    int y0=(s_srcypos + 0xFFFF)>>16;  // first luminance source line number below the dst line
        // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
    int srcuvpos= dstbpp==12 ?        s_srcypos + s_yinc/2 - 0x8000 :
                                    s_srcypos - 0x8000;
    int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
    int yalpha=((s_srcypos-1)&0xFFFF)>>4;
    int uvalpha=((srcuvpos-1)&0x1FFFF)>>5;
    uint16_t *buf0=pix_buf_y[y0&1];                // top line of the interpolated slice
    uint16_t *buf1=pix_buf_y[((y0+1)&1)];        // bottom line of the interpolated slice
    uint16_t *uvbuf0=pix_buf_uv[y1&1];                // top line of the interpolated slice
    uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1];        // bottom line of the interpolated slice

    if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are duplicates anyway

    if((y0&1) && dstbpp==12) uvalpha=-1; // there is no alpha if there is no line

    s_ypos++; s_srcypos+=s_yinc;

    // only interpolate the src line horizontally if we didn't do it already
        if(s_last_ypos!=y0)
        {
                unsigned char *src;
                // skip if first line has been horiz scaled already
                if(s_last_ypos != y0-1)
                {
                        // check if first line is before any available src lines
                        if(y0-1 < y)         src=srcptr[0]+(0     )*stride[0];
                        else                src=srcptr[0]+(y0-y-1)*stride[0];

                        hyscale(buf0, dstw, src, srcWidth, s_xinc);
                }
                // check if second line is after any available src lines
                if(y0-y >= h)        src=srcptr[0]+(h-1)*stride[0];
                else                src=srcptr[0]+(y0-y)*stride[0];

                // the min() is required to avoid reusing lines which were not available
                s_last_ypos= MIN(y0, y+h-1);
                hyscale(buf1, dstw, src, srcWidth, s_xinc);
        }
//        printf("%d %d %d %d\n", y, y1, s_last_y1pos, h);
      // *** horizontal scale U and V lines to temp buffer
        if(s_last_y1pos!=y1)
        {
                uint8_t *src1, *src2;
                // skip if first line has been horiz scaled already
                if(s_last_y1pos != y1-1)
                {
                        // check if first line is before any available src lines
                        if(y1-y/2-1 < 0)
                        {
                                src1= srcptr[1]+(0)*stride[1];
                                src2= srcptr[2]+(0)*stride[2];
                        }else{
                                src1= srcptr[1]+(y1-y/2-1)*stride[1];
                                src2= srcptr[2]+(y1-y/2-1)*stride[2];
                        }
                        hcscale(uvbuf0, dstUVw, src1, src2, srcWidth, s_xinc2);
                }

                // check if second line is after any available src lines
                if(y1 - y/2 >= h/2)
                {
                        src1= srcptr[1]+(h/2-1)*stride[1];
                        src2= srcptr[2]+(h/2-1)*stride[2];
                }else{
                        src1= srcptr[1]+(y1-y/2)*stride[1];
                        src2= srcptr[2]+(y1-y/2)*stride[2];
                }
                hcscale(uvbuf1, dstUVw, src1, src2, srcWidth, s_xinc2);

                // the min() is required to avoid reusing lines which were not available
                s_last_y1pos= MIN(y1, y/2+h/2-1);
        }
#ifdef HAVE_MMX
        b5Dither= dither8[s_ypos&1];
        g6Dither= dither4[s_ypos&1];
        g5Dither= dither8[s_ypos&1];
        r5Dither= dither8[(s_ypos+1)&1];
#endif

        if(dstbpp==12) //YV12
                yuv2yuv(buf0, buf1, uvbuf0, uvbuf1, dest, uDest, vDest, dstw, yalpha, uvalpha);
        else if(ABS(s_yinc - 0x10000) < 10)
                yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
        else
                yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
  }

#ifdef HAVE_MMX
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
}
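
/* Illustrative note on the vertical bookkeeping above: s_srcypos walks the
   source in 16.16 fixed point (one s_yinc step per output line), and
   yalpha=((s_srcypos-1)&0xFFFF)>>4 turns its fractional part into the 12-bit
   blend weight consumed by the yuv2* routines.  Example with an assumed
   position: s_srcypos=0x28000 lies halfway between two source lines and
   yields yalpha=0x7FFF>>4=2047, an even blend of buf0 and buf1. */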


void SwScale_Init(){
    // generating tables:
    int i;
    for(i=0;i<256;i++){
        clip_table[i]=0;
        clip_table[i+256]=i;
        clip_table[i+512]=255;
        yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
        yuvtab_3343[i]=0x3343*(i-128);
        yuvtab_0c92[i]=-0x0c92*(i-128);
        yuvtab_1a1e[i]=-0x1a1e*(i-128);
        yuvtab_40cf[i]=0x40cf*(i-128);
    }

    for(i=0; i<768; i++)
    {
        int v= clip_table[i];
        clip_table16b[i]= v>>3;
        clip_table16g[i]= (v<<3)&0x07E0;
        clip_table16r[i]= (v<<8)&0xF800;
        clip_table15b[i]= v>>3;
        clip_table15g[i]= (v<<2)&0x03E0;
        clip_table15r[i]= (v<<7)&0x7C00;
    }

}
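
/* Coefficient note (illustrative): the yuvtab constants are the usual BT.601
   YCbCr->RGB matrix in 3.13 fixed point: 0x2568/8192 ~= 1.164 (luma scale),
   0x3343 ~= 1.596 (V->R), 0x40cf ~= 2.018 (U->B), 0x1a1e ~= 0.813 (V->G) and
   0x0c92 ~= 0.391 (U->G).  The 256<<13 bias in yuvtab_2568 recenters the
   final (Y+C)>>13 sum so that typical underflow lands in the low third of
   the 768-entry clip tables (clamped to 0) and overflow in the high third
   (clamped to 255). */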