ffmpeg / postproc / swscale.c @ f62255fb

// Software scaling and colorspace conversion routines for MPlayer

// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by Michael are under GNU GPL

#include <inttypes.h>
#include <string.h>
#include "../config.h"
#include "swscale.h"
#include "../mmx_defs.h"
#undef MOVNTQ

//#undef HAVE_MMX2
//#undef HAVE_MMX
//#undef ARCH_X86
#define DITHER1XBPP
int fullUVIpol=0;
//disables the unscaled height version
int allwaysIpol=0;

#define RET 0xC3 //near return opcode
/*
NOTES

known BUGS with known cause (no bug reports please, but patches are welcome :) )
the horizontal MMX2 scaler reads 1-7 samples too many (might cause a sig11)

Supported output formats: BGR15, BGR16, BGR24, BGR32
the BGR15 & BGR16 MMX versions support dithering
Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
bicubic scaler
dither in C
change the distance of the u & v buffer
*/

#define ABS(a) ((a) > 0 ? (a) : (-(a)))
#define MIN(a,b) ((a) > (b) ? (b) : (a))
#define MAX(a,b) ((a) < (b) ? (b) : (a))

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif


#ifdef HAVE_MMX
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xF36EF36EF36EF36ELL;
static uint64_t __attribute__((aligned(8))) bF8=       0xF8F8F8F8F8F8F8F8LL;
static uint64_t __attribute__((aligned(8))) bFC=       0xFCFCFCFCFCFCFCFCLL;
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;

static uint64_t __attribute__((aligned(8))) b16Dither= 0x0004000400040004LL;
static uint64_t __attribute__((aligned(8))) b16Dither1=0x0004000400040004LL;
static uint64_t __attribute__((aligned(8))) b16Dither2=0x0602060206020602LL;
static uint64_t __attribute__((aligned(8))) g16Dither= 0x0002000200020002LL;
static uint64_t __attribute__((aligned(8))) g16Dither1=0x0002000200020002LL;
static uint64_t __attribute__((aligned(8))) g16Dither2=0x0301030103010301LL;

static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;

static uint64_t __attribute__((aligned(8))) temp0;
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif

// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
#ifdef HAVE_MMX
static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
#else
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];
#endif

// clipping helper table for C implementations:
static unsigned char clip_table[768];

static unsigned short clip_table16b[768];
static unsigned short clip_table16g[768];
static unsigned short clip_table16r[768];
static unsigned short clip_table15b[768];
static unsigned short clip_table15g[768];
static unsigned short clip_table15r[768];
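
/* The initialization of these clip tables is not part of this excerpt. As a
   minimal sketch of what they presumably contain (inferred from the 768
   entries, i.e. 256 of headroom on each side of the 0..255 range, and from
   the 5/6/5 resp. 5/5/5 masks below; example_init_clip_tables is a
   hypothetical helper, not in the original file): */
static void example_init_clip_tables(void)
{
        int i;
        for(i=0; i<768; i++)
        {
                int v= MAX(MIN(i-256, 255), 0); // clamp i-256 into 0..255
                clip_table[i]   = v;
                clip_table16b[i]= (v>>3);       // blue:  bits  0..4
                clip_table16g[i]= (v>>2)<<5;    // green: bits  5..10 (6 bits)
                clip_table16r[i]= (v>>3)<<11;   // red:   bits 11..15
                clip_table15b[i]= (v>>3);       // blue:  bits  0..4
                clip_table15g[i]= (v>>3)<<5;    // green: bits  5..9
                clip_table15r[i]= (v>>3)<<10;   // red:   bits 10..14
        }
}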

// yuv->rgb conversion tables:
static    int yuvtab_2568[256];
static    int yuvtab_3343[256];
static    int yuvtab_0c92[256];
static    int yuvtab_1a1e[256];
static    int yuvtab_40cf[256];
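
/* These tables are named after the hex value of the 2^13-scaled BT.601
   coefficient they apply (e.g. 0x2568/8192 ~ 1.17 for luma, 0x3343/8192
   ~ 1.6 for V->R). Their initialization is also outside this excerpt; a
   sketch of the presumed contents (example_init_yuv_tables is a hypothetical
   helper; the 256<<13 bias is an inference that centers the >>13 results
   inside the 768-entry clip tables above): */
static void example_init_yuv_tables(void)
{
        int i;
        for(i=0; i<256; i++)
        {
                yuvtab_2568[i]= (0x2568*(i- 16)) + (256<<13); // luma scale + clip-table bias
                yuvtab_3343[i]=   0x3343*(i-128);             // V contribution to R
                yuvtab_0c92[i]=  -0x0c92*(i-128);             // U contribution to G
                yuvtab_1a1e[i]=  -0x1a1e*(i-128);             // V contribution to G
                yuvtab_40cf[i]=   0x40cf*(i-128);             // U contribution to B
        }
}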


static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];

static int canMMX2BeUsed=0;

#define FULL_YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7                \n\t"\
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "xorl %%eax, %%eax                \n\t"\
                "1:                                \n\t"\
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq (%2, %%eax,2), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax,2), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm6, %%mm0                \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw %%mm5, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
                "movq 4096(%2, %%eax,2), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "movq 4096(%3, %%eax,2), %%mm0        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
                "psubw %%mm0, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w400, %%mm3                \n\t" /* 8(U-128)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
\
\
                "pmulhw %%mm5, %%mm4                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "pmulhw ubCoeff, %%mm3                \n\t"\
                "psraw $4, %%mm0                \n\t" /* uvbuf1[eax+2048] >>4*/\
                "pmulhw ugCoeff, %%mm2                \n\t"\
                "paddw %%mm4, %%mm0                \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw w400, %%mm0                \n\t" /* (V-128)8*/\
\
\
                "movq %%mm0, %%mm4                \n\t" /* (V-128)8*/\
                "pmulhw vrCoeff, %%mm0                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
                "paddw %%mm1, %%mm3                \n\t" /* B*/\
                "paddw %%mm1, %%mm0                \n\t" /* R*/\
                "packuswb %%mm3, %%mm3                \n\t"\
\
                "packuswb %%mm0, %%mm0                \n\t"\
                "paddw %%mm4, %%mm2                \n\t"\
                "paddw %%mm2, %%mm1                \n\t" /* G*/\
\
                "packuswb %%mm1, %%mm1                \n\t"

#define YSCALEYUV2RGB \
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movq %%mm6, asm_yalpha1        \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "movq %%mm5, asm_uvalpha1        \n\t"\
                "xorl %%eax, %%eax                \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, %%eax), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq asm_uvalpha1, %%mm0        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm6        \n\t" /*buf0[eax+4]*/\
                "movq 8(%1, %%eax, 2), %%mm7        \n\t" /*buf1[eax+4]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
                "pmulhw asm_yalpha1, %%mm0        \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw asm_yalpha1, %%mm6        \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf1[eax+4] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

#define YSCALEYUV2RGB1 \
                "xorl %%eax, %%eax                \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, %%eax), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] >>4*/\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm7        \n\t" /*buf0[eax+4]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

// do vertical chrominance interpolation (average of the two chroma lines)
#define YSCALEYUV2RGB1b \
                "xorl %%eax, %%eax                \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, %%eax), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $5, %%mm3                \n\t"\
                "psrlw $5, %%mm4                \n\t"\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm7        \n\t" /*buf0[eax+4]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

#define WRITEBGR32 \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        MOVNTQ(%%mm0, (%4, %%eax, 4))\
                        MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
                        MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
                        MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR16 \
                        "pand bF8, %%mm2                \n\t" /* B */\
                        "pand bFC, %%mm4                \n\t" /* G */\
                        "pand bF8, %%mm5                \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $3, %%mm3                \n\t"\
                        "psllq $3, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR15 \
                        "pand bF8, %%mm2                \n\t" /* B */\
                        "pand bF8, %%mm4                \n\t" /* G */\
                        "pand bF8, %%mm5                \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
                        "psrlq $1, %%mm5                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $2, %%mm3                \n\t"\
                        "psllq $2, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

// FIXME find a faster way to shuffle it to BGR24
#define WRITEBGR24 \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand bm00000111, %%mm4                \n\t" /* 00000RGB 0 */\
                        "pand bm11111000, %%mm0                \n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4                \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4                \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2                \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2                \n\t" /* 000RRGBR 1 */\
                        "pand bm00001111, %%mm2                \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4                \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand bm00000111, %%mm4                \n\t" /* 00000RGB 2 */\
                        "pand bm11111000, %%mm1                \n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1                \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4                \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2                \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4                \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5                \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand bm00000111, %%mm5                \n\t" /* 00000RGB 3 */\
                        "pand bm11111000, %%mm3                \n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3                \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3                \n\t" /* RGBRGBRG 2.5 */\
\
                        "leal (%%eax, %%eax, 2), %%ebx        \n\t"\
                        MOVNTQ(%%mm0, (%4, %%ebx))\
                        MOVNTQ(%%mm2, 8(%4, %%ebx))\
                        MOVNTQ(%%mm3, 16(%4, %%ebx))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"


static inline void yuv2yuv(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                           uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstw, int yalpha, int uvalpha)
{
        int yalpha1=yalpha^4095;
        int uvalpha1=uvalpha^4095;
        int i;

        asm volatile ("\n\t"::: "memory"); // compiler memory barrier

        for(i=0;i<dstw;i++)
        {
                ((uint8_t*)dest)[i] = (buf0[i]*yalpha1+buf1[i]*yalpha)>>19;
        }

        if(uvalpha != -1)
        {
                for(i=0; i<(dstw>>1); i++)
                {
                        ((uint8_t*)uDest)[i] = (uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19;
                        ((uint8_t*)vDest)[i] = (uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19;
                }
        }
}
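
/* Note on the >>19 above (an inference, not a statement from the original):
   the line buffers hold samples left-shifted by 7 bits from the horizontal
   scaler (see buf0[i]>>7 in yuv2rgb1 below), and the blend weights span
   0..4095 (12 bits), so (sample<<7)*weight needs >>(7+12)=19 to come back
   down to 8 bits. */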

/**
 * vertical scale YV12 to RGB
 */
static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
        int yalpha1=yalpha^4095;
        int uvalpha1=uvalpha^4095;
        int i;

        if(fullUVIpol)
        {
#ifdef HAVE_MMX
                if(dstbpp == 32)
                {
                        asm volatile(

FULL_YSCALEYUV2RGB
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0

                        "movq %%mm3, %%mm1                \n\t"
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0

                        MOVNTQ(%%mm3, (%4, %%eax, 4))
                        MOVNTQ(%%mm1, 8(%4, %%eax, 4))

                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        asm volatile(

FULL_YSCALEYUV2RGB

                                                                // lsb ... msb
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0

                        "movq %%mm3, %%mm1                \n\t"
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0

                        "movq %%mm3, %%mm2                \n\t" // BGR0BGR0
                        "psrlq $8, %%mm3                \n\t" // GR0BGR00
                        "pand bm00000111, %%mm2                \n\t" // BGR00000
                        "pand bm11111000, %%mm3                \n\t" // 000BGR00
                        "por %%mm2, %%mm3                \n\t" // BGRBGR00
                        "movq %%mm1, %%mm2                \n\t"
                        "psllq $48, %%mm1                \n\t" // 000000BG
                        "por %%mm1, %%mm3                \n\t" // BGRBGRBG

                        "movq %%mm2, %%mm1                \n\t" // BGR0BGR0
                        "psrld $16, %%mm2                \n\t" // R000R000
                        "psrlq $24, %%mm1                \n\t" // 0BGR0000
                        "por %%mm2, %%mm1                \n\t" // RBGRR000

                        "movl %4, %%ebx                        \n\t"
                        "addl %%eax, %%ebx                \n\t"

#ifdef HAVE_MMX2
                        //FIXME Alignment
                        "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
                        "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
                        "movd %%mm3, (%%ebx, %%eax, 2)        \n\t"
                        "psrlq $32, %%mm3                \n\t"
                        "movd %%mm3, 4(%%ebx, %%eax, 2)        \n\t"
                        "movd %%mm1, 8(%%ebx, %%eax, 2)        \n\t"
#endif
                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                        "paddusb b16Dither, %%mm1        \n\t"
                        "paddusb b16Dither, %%mm0        \n\t"
                        "paddusb b16Dither, %%mm3        \n\t"
#endif
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R

                        "psrlw $3, %%mm3                \n\t"
                        "psllw $2, %%mm1                \n\t"
                        "psllw $7, %%mm0                \n\t"
                        "pand g15Mask, %%mm1                \n\t"
                        "pand r15Mask, %%mm0                \n\t"

                        "por %%mm3, %%mm1                \n\t"
                        "por %%mm1, %%mm0                \n\t"

                        MOVNTQ(%%mm0, (%4, %%eax, 2))

                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                        "paddusb g16Dither, %%mm1        \n\t"
                        "paddusb b16Dither, %%mm0        \n\t"
                        "paddusb b16Dither, %%mm3        \n\t"
#endif
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R

                        "psrlw $3, %%mm3                \n\t"
                        "psllw $3, %%mm1                \n\t"
                        "psllw $8, %%mm0                \n\t"
                        "pand g16Mask, %%mm1                \n\t"
                        "pand r16Mask, %%mm0                \n\t"

                        "por %%mm3, %%mm1                \n\t"
                        "por %%mm1, %%mm0                \n\t"

                        MOVNTQ(%%mm0, (%4, %%eax, 2))

                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
#else
                asm volatile ("\n\t"::: "memory"); // compiler memory barrier

                if(dstbpp==32 || dstbpp==24)
                {
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                                dest+=dstbpp>>3;
                        }
                }
                else if(dstbpp==16)
                {
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[i] =
                                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
                        }
                }
                else if(dstbpp==15)
                {
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[i] =
                                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
                        }
                }
#endif
        }//FULL_UV_IPOL
        else
        {
#ifdef HAVE_MMX
                if(dstbpp == 32)
                {
                        asm volatile(
                                YSCALEYUV2RGB
                                WRITEBGR32

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        asm volatile(
                                YSCALEYUV2RGB
                                WRITEBGR24

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(
                                YSCALEYUV2RGB
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b16Dither, %%mm2        \n\t"
                                "paddusb b16Dither, %%mm4        \n\t"
                                "paddusb b16Dither, %%mm5        \n\t"
#endif

                                WRITEBGR15

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(
                                YSCALEYUV2RGB
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b16Dither, %%mm2        \n\t"
                                "paddusb g16Dither, %%mm4        \n\t" // green has 6 bits -> finer dither
                                "paddusb b16Dither, %%mm5        \n\t"
#endif

                                WRITEBGR16

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
#else
                asm volatile ("\n\t"::: "memory"); // compiler memory barrier

                if(dstbpp==32)
                {
                        for(i=0; i<dstw-1; i+=2){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                                int Cb= yuvtab_40cf[U];
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                                int Cr= yuvtab_3343[V];

                                dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
                                dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
                                dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

                                dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
                                dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
                                dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
                        }
                }
                else if(dstbpp==24)
                {
                        for(i=0; i<dstw-1; i+=2){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                                int Cb= yuvtab_40cf[U];
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                                int Cr= yuvtab_3343[V];

                                dest[0]=clip_table[((Y1 + Cb) >>13)];
                                dest[1]=clip_table[((Y1 + Cg) >>13)];
                                dest[2]=clip_table[((Y1 + Cr) >>13)];

                                dest[3]=clip_table[((Y2 + Cb) >>13)];
                                dest[4]=clip_table[((Y2 + Cg) >>13)];
                                dest[5]=clip_table[((Y2 + Cr) >>13)];
                                dest+=6;
                        }
                }
                else if(dstbpp==16)
                {
                        for(i=0; i<dstw-1; i+=2){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                                int Cb= yuvtab_40cf[U];
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                                int Cr= yuvtab_3343[V];

                                ((uint16_t*)dest)[i] =
                                        clip_table16b[(Y1 + Cb) >>13] |
                                        clip_table16g[(Y1 + Cg) >>13] |
                                        clip_table16r[(Y1 + Cr) >>13];

                                ((uint16_t*)dest)[i+1] =
                                        clip_table16b[(Y2 + Cb) >>13] |
                                        clip_table16g[(Y2 + Cg) >>13] |
                                        clip_table16r[(Y2 + Cr) >>13];
                        }
                }
                else if(dstbpp==15)
                {
                        for(i=0; i<dstw-1; i+=2){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                                int Cb= yuvtab_40cf[U];
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                                int Cr= yuvtab_3343[V];

                                ((uint16_t*)dest)[i] =
                                        clip_table15b[(Y1 + Cb) >>13] |
                                        clip_table15g[(Y1 + Cg) >>13] |
                                        clip_table15r[(Y1 + Cr) >>13];

                                ((uint16_t*)dest)[i+1] =
                                        clip_table15b[(Y2 + Cb) >>13] |
                                        clip_table15g[(Y2 + Cg) >>13] |
                                        clip_table15r[(Y2 + Cr) >>13];
                        }
                }
#endif
        } //!FULL_UV_IPOL
}
873

    
874
/**
875
 * YV12 to RGB without scaling or interpolating
876
 */
877
static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
878
                            uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
879
{
880
        int yalpha1=yalpha^4095;
881
        int uvalpha1=uvalpha^4095;
882
        int i;
883
        if(fullUVIpol || allwaysIpol)
884
        {
885
                yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
886
                return;
887
        }
888
        if( yalpha > 2048 ) buf0 = buf1;
889

    
890
#ifdef HAVE_MMX
891
        if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
892
        {
893
                if(dstbpp == 32)
894
                {
895
                        asm volatile(
896
                                YSCALEYUV2RGB1
897
                                WRITEBGR32
898
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
899
                        "m" (yalpha1), "m" (uvalpha1)
900
                        : "%eax"
901
                        );
902
                }
903
                else if(dstbpp==24)
904
                {
905
                        asm volatile(
906
                                YSCALEYUV2RGB1
907
                                WRITEBGR24
908
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
909
                        "m" (yalpha1), "m" (uvalpha1)
910
                        : "%eax", "%ebx"
911
                        );
912
                }
913
                else if(dstbpp==15)
914
                {
915
                        asm volatile(
916
                                YSCALEYUV2RGB1
917
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
918
#ifdef DITHER1XBPP
919
                                "paddusb b16Dither, %%mm2        \n\t"
920
                                "paddusb b16Dither, %%mm4        \n\t"
921
                                "paddusb b16Dither, %%mm5        \n\t"
922
#endif
923
                                WRITEBGR15
924
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
925
                        "m" (yalpha1), "m" (uvalpha1)
926
                        : "%eax"
927
                        );
928
                }
929
                else if(dstbpp==16)
930
                {
931
                        asm volatile(
932
                                YSCALEYUV2RGB1
933
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
934
#ifdef DITHER1XBPP
935
                                "paddusb g16Dither, %%mm2        \n\t"
936
                                "paddusb b16Dither, %%mm4        \n\t"
937
                                "paddusb b16Dither, %%mm5        \n\t"
938
#endif
939

    
940
                                WRITEBGR16
941
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
942
                        "m" (yalpha1), "m" (uvalpha1)
943
                        : "%eax"
944
                        );
945
                }
946
        }
947
        else
948
        {
949
                if(dstbpp == 32)
950
                {
951
                        asm volatile(
952
                                YSCALEYUV2RGB1b
953
                                WRITEBGR32
954
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
955
                        "m" (yalpha1), "m" (uvalpha1)
956
                        : "%eax"
957
                        );
958
                }
959
                else if(dstbpp==24)
960
                {
961
                        asm volatile(
962
                                YSCALEYUV2RGB1b
963
                                WRITEBGR24
964
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
965
                        "m" (yalpha1), "m" (uvalpha1)
966
                        : "%eax", "%ebx"
967
                        );
968
                }
969
                else if(dstbpp==15)
970
                {
971
                        asm volatile(
972
                                YSCALEYUV2RGB1b
973
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
974
#ifdef DITHER1XBPP
975
                                "paddusb b16Dither, %%mm2        \n\t"
976
                                "paddusb b16Dither, %%mm4        \n\t"
977
                                "paddusb b16Dither, %%mm5        \n\t"
978
#endif
979
                                WRITEBGR15
980
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
981
                        "m" (yalpha1), "m" (uvalpha1)
982
                        : "%eax"
983
                        );
984
                }
985
                else if(dstbpp==16)
986
                {
987
                        asm volatile(
988
                                YSCALEYUV2RGB1b
989
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
990
#ifdef DITHER1XBPP
991
                                "paddusb g16Dither, %%mm2        \n\t"
992
                                "paddusb b16Dither, %%mm4        \n\t"
993
                                "paddusb b16Dither, %%mm5        \n\t"
994
#endif
995

    
996
                                WRITEBGR16
997
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
998
                        "m" (yalpha1), "m" (uvalpha1)
999
                        : "%eax"
1000
                        );
1001
                }
1002
        }
1003
#else
//FIXME write 2 versions (for even & odd lines)
        asm volatile ("\n\t"::: "memory");

        if(dstbpp==32)
        {
                for(i=0; i<dstw-1; i+=2){
                        // vertical linear interpolation of U/V (Y is unscaled here) && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
                        dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
                        dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

                        dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
                        dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
                        dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
                }
        }
        else if(dstbpp==24)
        {
                for(i=0; i<dstw-1; i+=2){
                        // vertical linear interpolation of U/V (Y is unscaled here) && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        dest[0]=clip_table[((Y1 + Cb) >>13)];
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
                        dest[2]=clip_table[((Y1 + Cr) >>13)];

                        dest[3]=clip_table[((Y2 + Cb) >>13)];
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
                        dest+=6;
                }
        }
        else if(dstbpp==16)
        {
                for(i=0; i<dstw-1; i+=2){
                        // vertical linear interpolation of U/V (Y is unscaled here) && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        ((uint16_t*)dest)[i] =
                                clip_table16b[(Y1 + Cb) >>13] |
                                clip_table16g[(Y1 + Cg) >>13] |
                                clip_table16r[(Y1 + Cr) >>13];

                        ((uint16_t*)dest)[i+1] =
                                clip_table16b[(Y2 + Cb) >>13] |
                                clip_table16g[(Y2 + Cg) >>13] |
                                clip_table16r[(Y2 + Cr) >>13];
                }
        }
        else if(dstbpp==15)
        {
                for(i=0; i<dstw-1; i+=2){
                        // vertical linear interpolation of U/V (Y is unscaled here) && yuv2rgb in a single step:
                        int Y1=yuvtab_2568[buf0[i]>>7];
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

                        int Cb= yuvtab_40cf[U];
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
                        int Cr= yuvtab_3343[V];

                        ((uint16_t*)dest)[i] =
                                clip_table15b[(Y1 + Cb) >>13] |
                                clip_table15g[(Y1 + Cg) >>13] |
                                clip_table15r[(Y1 + Cr) >>13];

                        ((uint16_t*)dest)[i+1] =
                                clip_table15b[(Y2 + Cb) >>13] |
                                clip_table15g[(Y2 + Cg) >>13] |
                                clip_table15r[(Y2 + Cr) >>13];
                }
        }
#endif
}

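/* hyscale: horizontally scale one luma line into a 16-bit temp buffer.
   The source position is a 16.16 fixed-point DDA (xInc is nominally
   (srcWidth<<16)/dstWidth): xx = xpos>>16 picks the sample pair and
   xalpha = (xpos&0xFFFF)>>9 is a 7-bit blend weight, so each output keeps
   ~15 bits (src<<7) of precision for the vertical pass that follows. */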
static inline void hyscale(uint16_t *dst, int dstWidth, uint8_t *src, int srcWidth, int xInc)
{
        int i;
        unsigned int xpos=0;
        // *** horizontal scale Y line to temp buffer
#ifdef ARCH_X86
#ifdef HAVE_MMX2
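        /* The MMX2 path does not loop per pixel here: funnyYCode is machine
           code generated at run time in SwScale_YV12slice, one pshufw-based
           fragment per 4 output pixels, ending in a RET. Each call below runs
           the whole fragment chain and covers an eighth of the line, with
           %%esi/%%eax carrying the source/dest position across calls. */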
        if(canMMX2BeUsed)
        {
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
                        "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "movq %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t"
                        "paddw %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t"
                        "paddw %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFFFF
                        "movq %%mm2, temp0                \n\t"
                        "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "xorl %%eax, %%eax                \n\t" // i
                        "movl %0, %%esi                        \n\t" // src
                        "movl %1, %%edi                        \n\t" // dst
                        "movl %3, %%edx                        \n\t" // (xInc*4)>>16
                        "xorl %%ecx, %%ecx                \n\t"
                        "xorl %%ebx, %%ebx                \n\t"
                        "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF

#define FUNNY_Y_CODE \
                        PREFETCH" 1024(%%esi)                \n\t"\
                        PREFETCH" 1056(%%esi)                \n\t"\
                        PREFETCH" 1088(%%esi)                \n\t"\
                        "call funnyYCode                \n\t"\
                        "movq temp0, %%mm2                \n\t"\
                        "xorl %%ecx, %%ecx                \n\t"

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                        :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                        "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
                );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth-1; i--) dst[i] = src[srcWidth-1]*128;
        }
        else
        {
#endif
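        /* Non-MMX2 x86 path: a scalar DDA. The 16-bit fractional step is
           accumulated in %cx with "addw"; its carry is folded into the
           integer source index %ebx by "adcl", so the 16.16 position never
           needs shifting per pixel. */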
        // no MMX2 -- plain x86 asm ...
        asm volatile(
                "xorl %%eax, %%eax                \n\t" // i
                "xorl %%ebx, %%ebx                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                "1:                                \n\t"
                "movzbl  (%0, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%edi, %%eax, 2)        \n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry

                "movzbl (%0, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 2(%%edi, %%eax, 2)        \n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry

                "addl $2, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"

                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
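      /* Portable C fallback of the same DDA. Example: with xInc=0x18000,
         i=1 gives xx=1 and xalpha=64, so dst[1] = 64*(src[1]+src[2]) --
         the midpoint of the two samples at the <<7 output scale. */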
      for(i=0;i<dstWidth;i++){
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
      }
#endif
}

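/* hcscale: same horizontal DDA as hyscale, but for both chroma planes in
   one pass: U results land at dst[0..], V results at dst[2048..] (the
   +4096 byte offset in the asm below), matching the pix_buf_uv layout. */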
inline static void hcscale(uint16_t *dst, int dstWidth,
                                uint8_t *src1, uint8_t *src2, int srcWidth, int xInc)
{
        int xpos=0;
        int i;
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        if(canMMX2BeUsed)
        {
                asm volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
                "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
                "punpcklwd %%mm6, %%mm6                \n\t"
                "punpcklwd %%mm6, %%mm6                \n\t"
                "movq %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t"
                "paddw %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t"
                "paddw %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFFFF
                "movq %%mm2, temp0                \n\t"
                "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
                "punpcklwd %%mm6, %%mm6                \n\t"
                "punpcklwd %%mm6, %%mm6                \n\t"
                "xorl %%eax, %%eax                \n\t" // i
                "movl %0, %%esi                        \n\t" // src1
                "movl %1, %%edi                        \n\t" // dst
                "movl %3, %%edx                        \n\t" // (xInc*4)>>16
                "xorl %%ecx, %%ecx                \n\t"
                "xorl %%ebx, %%ebx                \n\t"
                "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF

#define FUNNYUVCODE \
                        PREFETCH" 1024(%%esi)                \n\t"\
                        PREFETCH" 1056(%%esi)                \n\t"\
                        PREFETCH" 1088(%%esi)                \n\t"\
                        "call funnyUVCode                \n\t"\
                        "movq temp0, %%mm2                \n\t"\
                        "xorl %%ecx, %%ecx                \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
                "xorl %%eax, %%eax                \n\t" // i
                "movl %6, %%esi                        \n\t" // src2
                "movl %1, %%edi                        \n\t" // dst
                "addl $4096, %%edi                \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                  "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
        );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcWidth/2-1; i--)
                {
                        dst[i] = src1[srcWidth/2-1]*128;
                        dst[i+2048] = src2[srcWidth/2-1]*128;
                }
        }
        else
        {
#endif
        asm volatile(
                "xorl %%eax, %%eax                \n\t" // i
                "xorl %%ebx, %%ebx                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                "1:                                \n\t"
                "movl %0, %%esi                        \n\t"
                "movzbl  (%%esi, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%%esi, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%edi, %%eax, 2)        \n\t"

                "movzbl  (%5, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%5, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"

                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry
                "addl $1, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"

                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
                "r" (src2)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
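      /* Portable C fallback. xalpha is 7 bits wide, so (xalpha^127) equals
         127-xalpha; the two weights sum to 127, a slightly cheaper
         approximation of the exact <<7 form kept below for reference. */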
      for(i=0;i<dstWidth;i++){
          register unsigned int xx=xpos>>16;
          register unsigned int xalpha=(xpos&0xFFFF)>>9;
          dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
          dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
          xpos+=xInc;
      }
#endif
}

// *** bilinear scaling and yuv->rgb or yuv->yuv conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
// s_xinc = (src_width << 16) / dst_width
// s_yinc = (src_height << 16) / dst_height
void SwScale_YV12slice(unsigned char* srcptr[],int stride[], int y, int h,
                             uint8_t* dstptr[], int dststride, int dstw, int dstbpp,
                             unsigned int s_xinc,unsigned int s_yinc){

// scaling factors:
//static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
//static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;

unsigned int s_xinc2;

static int s_srcypos; // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
static int s_ypos;

// last horizontally interpolated lines, used to avoid unnecessary calculations
static int s_last_ypos;
static int s_last_y1pos;

static int static_dstw;

#ifdef HAVE_MMX2
// used to detect a horizontal size change
static int old_dstw= -1;
static int old_s_xinc= -1;
#endif

int srcWidth= (dstw*s_xinc + 0x8000)>>16;
int dstUVw= fullUVIpol ? dstw : dstw/2;
int i;

#ifdef HAVE_MMX2
canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0;
#endif

// match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
// n-2 is the last chrominance sample available
// FIXME this is not perfect, but no one should notice the difference, the more correct variant
// would be like the vertical one, but that would require some special code for the
// first and last pixel
if(canMMX2BeUsed)         s_xinc+= 20;
else                        s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20;

if(fullUVIpol && !(dstbpp==12))         s_xinc2= s_xinc>>1;
else                                        s_xinc2= s_xinc;

  // force calculation of the horizontal interpolation of the first line
  if(y==0){
        s_last_ypos=-99;
        s_last_y1pos=-99;
        s_srcypos= s_yinc/2 - 0x8000;
        s_ypos=0;

        // clean the buffers so that no green stuff is drawn if the width is not sane (%8=0)
        for(i=dstw-2; i<dstw+20; i++)
        {
                pix_buf_uv[0][i] = pix_buf_uv[1][i]
                = pix_buf_uv[0][2048+i] = pix_buf_uv[1][2048+i] = 128;
                pix_buf_uv[0][i/2] = pix_buf_uv[1][i/2]
                = pix_buf_uv[0][2048+i/2] = pix_buf_uv[1][2048+i/2] = 128;
                pix_buf_y[0][i]= pix_buf_y[1][i]= 0;
        }

#ifdef HAVE_MMX2
// can't downscale !!!
        if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed)
        {
                uint8_t *fragment;
                int imm8OfPShufW1;
                int imm8OfPShufW2;
                int fragmentLength;

                int xpos, xx, xalpha, i;

                old_s_xinc= s_xinc;
                old_dstw= dstw;

                static_dstw= dstw;

                // create an optimized horizontal scaling routine

                //code fragment

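                /* This asm block only emits a template: execution jumps
                   straight to label 9. The leal/subl arithmetic then returns
                   the template's start address and length plus the offsets of
                   the two pshufw immediate bytes, so the loops below can copy
                   and patch it into funnyYCode/funnyUVCode. */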
                asm volatile(
                        "jmp 9f                                \n\t"
                // Begin
                        "0:                                \n\t"
                        "movq (%%esi), %%mm0                \n\t" //FIXME Alignment
                        "movq %%mm0, %%mm1                \n\t"
                        "psrlq $8, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm1        \n\t"
                        "movq %%mm2, %%mm3                \n\t"
                        "punpcklbw %%mm7, %%mm0        \n\t"
                        "addw %%bx, %%cx                \n\t" //2*xalpha += (4*s_xinc)&0xFFFF
                        "pshufw $0xFF, %%mm1, %%mm1        \n\t"
                        "1:                                \n\t"
                        "adcl %%edx, %%esi                \n\t" //xx+= (4*s_xinc)>>16 + carry
                        "pshufw $0xFF, %%mm0, %%mm0        \n\t"
                        "2:                                \n\t"
                        "psrlw $9, %%mm3                \n\t"
                        "psubw %%mm1, %%mm0                \n\t"
                        "pmullw %%mm3, %%mm0                \n\t"
                        "paddw %%mm6, %%mm2                \n\t" // 2*xalpha += (4*s_xinc)&0xFFFF
                        "psllw $7, %%mm1                \n\t"
                        "paddw %%mm1, %%mm0                \n\t"

                        "movq %%mm0, (%%edi, %%eax)        \n\t"

                        "addl $8, %%eax                        \n\t"
                // End
                        "9:                                \n\t"
//                "int $3\n\t"
                        "leal 0b, %0                        \n\t"
                        "leal 1b, %1                        \n\t"
                        "leal 2b, %2                        \n\t"
                        "decl %1                        \n\t"
                        "decl %2                        \n\t"
                        "subl %0, %1                        \n\t"
                        "subl %0, %2                        \n\t"
                        "leal 9b, %3                        \n\t"
                        "subl %0, %3                        \n\t"
                        :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
                         "=r" (fragmentLength)
                );

                xpos= 0; //s_xinc/2 - 0x8000; // difference between pixel centers

                /* choose xinc so that all 8 parts fit exactly
                   Note: we cannot use just 1 part because it would not fit in the code cache */
//                s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))-10;
//                s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8));
#ifdef ALT_ERROR
//                s_xinc2_diff+= ((0x10000/(dstw/8)));
#endif
//                s_xinc_diff= s_xinc2_diff*2;

//                s_xinc2+= s_xinc2_diff;
//                s_xinc+= s_xinc_diff;

//                old_s_xinc= s_xinc;

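                /* Patch loop: one template copy per 4 output pixels. b, c, d
                   are the source offsets of pixels 1..3 relative to pixel 0;
                   packed as a|(b<<2)|(c<<4)|(d<<6) they form the pshufw
                   immediate that routes the loaded bytes to the 4 output
                   lanes. The 0x6E patch turns the 8-byte movq load into a
                   4-byte movd when the fragment only needs 4 source bytes. */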
                for(i=0; i<dstw/8; i++)
                {
                        int xx=xpos>>16;

                        if((i&3) == 0)
                        {
                                int a=0;
                                int b=((xpos+s_xinc)>>16) - xx;
                                int c=((xpos+s_xinc*2)>>16) - xx;
                                int d=((xpos+s_xinc*3)>>16) - xx;

                                memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);

                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                        a | (b<<2) | (c<<4) | (d<<6);

                                // if we don't need to read 8 bytes then don't :), reduces the chance of
                                // crossing a cache line
                                if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E;

                                funnyYCode[fragmentLength*(i+4)/4]= RET;
                        }
                        xpos+=s_xinc;
                }

                xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chrom samples
                for(i=0; i<dstUVw/8; i++)
                {
                        int xx=xpos>>16;

                        if((i&3) == 0)
                        {
                                int a=0;
                                int b=((xpos+s_xinc2)>>16) - xx;
                                int c=((xpos+s_xinc2*2)>>16) - xx;
                                int d=((xpos+s_xinc2*3)>>16) - xx;

                                memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);

                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                        a | (b<<2) | (c<<4) | (d<<6);

                                // if we don't need to read 8 bytes then don't :), reduces the chance of
                                // crossing a cache line
                                if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;

                                funnyUVCode[fragmentLength*(i+4)/4]= RET;
                        }
                        xpos+=s_xinc2;
                }
//                funnyCode[0]= RET;
        }

#endif // HAVE_MMX2
  } // reset counters

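  /* Main vertical loop: s_srcypos steps through the source in 16.16 fixed
     point. y0/y1 are the first luma/chroma source lines below the current
     output line; yalpha/uvalpha are 12-bit blend weights between the two
     buffered lines (chroma uses a 17-bit position as it is vertically
     halved). The b16/g16 dither constants are rotated after each output
     line to form a 2-line ordered-dither pattern. */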
  while(1){
    unsigned char *dest =dstptr[0]+dststride*s_ypos;
    unsigned char *uDest=dstptr[1]+(dststride>>1)*(s_ypos>>1);
    unsigned char *vDest=dstptr[2]+(dststride>>1)*(s_ypos>>1);

    int y0=(s_srcypos + 0xFFFF)>>16;  // first luminance source line number below the dst line
        // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
    int srcuvpos= dstbpp==12 ?        s_srcypos + s_yinc/2 - 0x8000 :
                                    s_srcypos - 0x8000;
    int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
    int yalpha=((s_srcypos-1)&0xFFFF)>>4;
    int uvalpha=((srcuvpos-1)&0x1FFFF)>>5;
    uint16_t *buf0=pix_buf_y[y0&1];                // top line of the interpolated slice
    uint16_t *buf1=pix_buf_y[((y0+1)&1)];        // bottom line of the interpolated slice
    uint16_t *uvbuf0=pix_buf_uv[y1&1];                // top line of the interpolated slice
    uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1];        // bottom line of the interpolated slice
    int i;

    if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are duplicates anyway

    if((y0&1) && dstbpp==12) uvalpha=-1; // there is no alpha if there is no line

    s_ypos++; s_srcypos+=s_yinc;

    // only interpolate the src line horizontally if we didn't do it already
        if(s_last_ypos!=y0)
        {
                unsigned char *src;
                // skip if first line has been horizontally scaled already
                if(s_last_ypos != y0-1)
                {
                        // check if first line is before any available src lines
                        if(y0-1 < y)         src=srcptr[0]+(0     )*stride[0];
                        else                src=srcptr[0]+(y0-y-1)*stride[0];

                        hyscale(buf0, dstw, src, srcWidth, s_xinc);
                }
                // check if second line is after any available src lines
                if(y0-y >= h)        src=srcptr[0]+(h-1)*stride[0];
                else                src=srcptr[0]+(y0-y)*stride[0];

                // the min() is required to avoid reusing lines which were not available
                s_last_ypos= MIN(y0, y+h-1);
                hyscale(buf1, dstw, src, srcWidth, s_xinc);
        }
//        printf("%d %d %d %d\n", y, y1, s_last_y1pos, h);
      // *** horizontal scale U and V lines to temp buffer
        if(s_last_y1pos!=y1)
        {
                uint8_t *src1, *src2;
                // skip if first line has been horizontally scaled already
                if(s_last_y1pos != y1-1)
                {
                        // check if first line is before any available src lines
                        if(y1-y/2-1 < 0)
                        {
                                src1= srcptr[1]+(0)*stride[1];
                                src2= srcptr[2]+(0)*stride[2];
                        }else{
                                src1= srcptr[1]+(y1-y/2-1)*stride[1];
                                src2= srcptr[2]+(y1-y/2-1)*stride[2];
                        }
                        hcscale(uvbuf0, dstUVw, src1, src2, srcWidth, s_xinc2);
                }

                // check if second line is after any available src lines
                if(y1 - y/2 >= h/2)
                {
                        src1= srcptr[1]+(h/2-1)*stride[1];
                        src2= srcptr[2]+(h/2-1)*stride[2];
                }else{
                        src1= srcptr[1]+(y1-y/2)*stride[1];
                        src2= srcptr[2]+(y1-y/2)*stride[2];
                }
                hcscale(uvbuf1, dstUVw, src1, src2, srcWidth, s_xinc2);

                // the min() is required to avoid reusing lines which were not available
                s_last_y1pos= MIN(y1, y/2+h/2-1);
        }

        if(dstbpp==12) //YV12
                yuv2yuv(buf0, buf1, uvbuf0, uvbuf1, dest, uDest, vDest, dstw, yalpha, uvalpha);
        else if(ABS(s_yinc - 0x10000) < 10)
                yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
        else
                yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);

#ifdef HAVE_MMX
        b16Dither= b16Dither1;
        b16Dither1= b16Dither2;
        b16Dither2= b16Dither;

        g16Dither= g16Dither1;
        g16Dither1= g16Dither2;
        g16Dither2= g16Dither;
#endif
  }

#ifdef HAVE_MMX
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
}

void SwScale_Init(){
    // generating tables:
    int i;
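    /* clip_table is 768 entries: [0..255]->0, [256..511]->0..255,
       [512..767]->255. The yuvtab_* tables are the conversion coefficients
       premultiplied by 2^13 (e.g. 0x2568/8192 ~= 1.17, the luma scale),
       with the +256 clip bias folded into yuvtab_2568. */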
    for(i=0;i<256;i++){
        clip_table[i]=0;
        clip_table[i+256]=i;
        clip_table[i+512]=255;
        yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
        yuvtab_3343[i]=0x3343*(i-128);
        yuvtab_0c92[i]=-0x0c92*(i-128);
        yuvtab_1a1e[i]=-0x1a1e*(i-128);
        yuvtab_40cf[i]=0x40cf*(i-128);
    }

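    /* Precompose each channel's contribution to a 5-6-5 / 5-5-5 pixel so a
       16-bit write is just an OR of three clipped table lookups. */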
    for(i=0; i<768; i++)
    {
        int v= clip_table[i];
        clip_table16b[i]= v>>3;
        clip_table16g[i]= (v<<3)&0x07E0;
        clip_table16r[i]= (v<<8)&0xF800;
        clip_table15b[i]= v>>3;
        clip_table15g[i]= (v<<2)&0x03E0;
        clip_table15r[i]= (v<<7)&0x7C00;
    }
}