ffmpeg / postproc / swscale_template.c @ d604bab9


// Software scaling and colorspace conversion routines for MPlayer

// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
// the parts written by Michael are under GNU GPL

#include <inttypes.h>
#include "../config.h"
#include "swscale.h"

//#undef HAVE_MMX2
//#undef HAVE_MMX
//#undef ARCH_X86
#define DITHER1XBPP
int fullUVIpol=0;
//disables the unscaled height version
int allwaysIpol=0;

#define RET 0xC3 //near return opcode
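// (added note: 0xC3 is the single-byte x86 "ret" instruction; it is appended to the
//  run-time generated horizontal scalers below, e.g.
//  "funnyYCode[fragmentLength*(i+4)/4]= RET;" in SwScale_YV12slice_brg24)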
/*
NOTES

known BUGS with known cause (no bug reports please, but patches are welcome :) )
horizontal MMX2 scaler reads 1-7 samples too much (might cause a sig11)

Supported output formats: BGR15, BGR16, BGR24, BGR32 (15 and 24 are untested)
BGR15 & BGR16 MMX versions support dithering
Special versions: fast Y 1:1 scaling (no interpolation in y direction)

TODO
more intelligent misalignment avoidance for the horizontal scaler
*/

#define ABS(a) ((a) > 0 ? (a) : (-(a)))

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif


#ifdef HAVE_MMX
static uint64_t __attribute__((aligned(8))) yCoeff=    0x2568256825682568LL;
static uint64_t __attribute__((aligned(8))) ubCoeff=   0x3343334333433343LL;
static uint64_t __attribute__((aligned(8))) vrCoeff=   0x40cf40cf40cf40cfLL;
static uint64_t __attribute__((aligned(8))) ugCoeff=   0xE5E2E5E2E5E2E5E2LL;
static uint64_t __attribute__((aligned(8))) vgCoeff=   0xF36EF36EF36EF36ELL;
static uint64_t __attribute__((aligned(8))) w400=      0x0400040004000400LL;
static uint64_t __attribute__((aligned(8))) w80=       0x0080008000800080LL;
static uint64_t __attribute__((aligned(8))) w10=       0x0010001000100010LL;
static uint64_t __attribute__((aligned(8))) bm00001111=0x00000000FFFFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm00000111=0x0000000000FFFFFFLL;
static uint64_t __attribute__((aligned(8))) bm11111000=0xFFFFFFFFFF000000LL;

static uint64_t __attribute__((aligned(8))) b16Dither= 0x0004000400040004LL;
static uint64_t __attribute__((aligned(8))) b16Dither1=0x0004000400040004LL;
static uint64_t __attribute__((aligned(8))) b16Dither2=0x0602060206020602LL;
static uint64_t __attribute__((aligned(8))) g16Dither= 0x0002000200020002LL;
static uint64_t __attribute__((aligned(8))) g16Dither1=0x0002000200020002LL;
static uint64_t __attribute__((aligned(8))) g16Dither2=0x0301030103010301LL;

static uint64_t __attribute__((aligned(8))) b16Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g16Mask=   0x07E007E007E007E0LL;
static uint64_t __attribute__((aligned(8))) r16Mask=   0xF800F800F800F800LL;
static uint64_t __attribute__((aligned(8))) b15Mask=   0x001F001F001F001FLL;
static uint64_t __attribute__((aligned(8))) g15Mask=   0x03E003E003E003E0LL;
static uint64_t __attribute__((aligned(8))) r15Mask=   0x7C007C007C007C00LL;

static uint64_t __attribute__((aligned(8))) temp0;
static uint64_t __attribute__((aligned(8))) asm_yalpha1;
static uint64_t __attribute__((aligned(8))) asm_uvalpha1;
#endif
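
/* added note (not in the original): pmulhw keeps only the high 16 bits of a
   signed 16x16 bit product, i.e. per 16-bit lane it behaves roughly like

        int16_t pmulhw_lane(int16_t a, int16_t b){ return ((int32_t)a*b)>>16; }

   so the quadword constants above act as packed fixed-point multipliers for the
   YUV->RGB conversion, while w80 and w400 are the packed 8*16 and 8*128 offsets
   subtracted from Y and U/V in the macros below (see the "8(Y-16)" / "8(U-128)"
   comments there). */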

// temporary storage for 4 yuv lines:
// 16bit for now (mmx likes it more compact)
#ifdef HAVE_MMX
static uint16_t __attribute__((aligned(8))) pix_buf_y[4][2048];
static uint16_t __attribute__((aligned(8))) pix_buf_uv[2][2048*2];
#else
static uint16_t pix_buf_y[4][2048];
static uint16_t pix_buf_uv[2][2048*2];
#endif

// clipping helper table for C implementations:
static unsigned char clip_table[768];

// yuv->rgb conversion tables:
static    int yuvtab_2568[256];
static    int yuvtab_3343[256];
static    int yuvtab_0c92[256];
static    int yuvtab_1a1e[256];
static    int yuvtab_40cf[256];


static uint8_t funnyYCode[10000];
static uint8_t funnyUVCode[10000];
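
/* added for orientation (not in the original): the MMX macros below perform, per
   pixel, the same vertical linear interpolation + YUV->RGB conversion as the plain
   C fall-back further down in this file, i.e. roughly

        Y= yuvtab_2568[(buf0[i]*yalpha1        + buf1[i]*yalpha        )>>19];
        U=            ( uvbuf0[i]*uvalpha1     + uvbuf1[i]*uvalpha     )>>19;
        V=            ( uvbuf0[i+2048]*uvalpha1+ uvbuf1[i+2048]*uvalpha)>>19;
        B= clip_table[(Y + yuvtab_3343[U]                  )>>13];
        G= clip_table[(Y + yuvtab_0c92[V] + yuvtab_1a1e[U] )>>13];
        R= clip_table[(Y + yuvtab_40cf[V]                  )>>13];

   only with several pixels handled per iteration. */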

#define FULL_YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7                \n\t"\
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "xorl %%eax, %%eax                \n\t"\
                "1:                                \n\t"\
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq (%2, %%eax,2), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax,2), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm6, %%mm0                \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw %%mm5, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "movq 4096(%2, %%eax,2), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "movq 4096(%3, %%eax,2), %%mm0        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "psubw %%mm0, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w400, %%mm3                \n\t" /* 8(U-128)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
\
\
                "pmulhw %%mm5, %%mm4                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "pmulhw ubCoeff, %%mm3                \n\t"\
                "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "pmulhw ugCoeff, %%mm2                \n\t"\
                "paddw %%mm4, %%mm0                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw w400, %%mm0                \n\t" /* (V-128)8*/\
\
\
                "movq %%mm0, %%mm4                \n\t" /* (V-128)8*/\
                "pmulhw vrCoeff, %%mm0                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
                "paddw %%mm1, %%mm3                \n\t" /* B*/\
                "paddw %%mm1, %%mm0                \n\t" /* R*/\
                "packuswb %%mm3, %%mm3                \n\t"\
\
                "packuswb %%mm0, %%mm0                \n\t"\
                "paddw %%mm4, %%mm2                \n\t"\
                "paddw %%mm2, %%mm1                \n\t" /* G*/\
\
                "packuswb %%mm1, %%mm1                \n\t"

#define YSCALEYUV2RGB \
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movq %%mm6, asm_yalpha1        \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "movq %%mm5, asm_uvalpha1        \n\t"\
                "xorl %%eax, %%eax                \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, %%eax), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq asm_uvalpha1, %%mm0        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm6        \n\t" /*buf0[eax]*/\
                "movq 8(%1, %%eax, 2), %%mm7        \n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw asm_yalpha1, %%mm0        \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw asm_yalpha1, %%mm6        \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

#define YSCALEYUV2RGB1 \
                "xorl %%eax, %%eax                \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, %%eax), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%1, %%eax, 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

#define WRITEBGR32 \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        MOVNTQ(%%mm0, (%4, %%eax, 4))\
                        MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
                        MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
                        MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR16 \
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm4, %%mm3                \n\t" /* G */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
\
                        "punpcklbw %%mm7, %%mm3                \n\t" /* 0G0G0G0G */\
                        "punpcklbw %%mm7, %%mm2                \n\t" /* 0B0B0B0B */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R */\
\
                        "psrlw $3, %%mm2                \n\t"\
                        "psllw $3, %%mm3                \n\t"\
                        "psllw $8, %%mm5                \n\t"\
\
                        "pand g16Mask, %%mm3                \n\t"\
                        "pand r16Mask, %%mm5                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm5, %%mm2                \n\t"\
\
                        "punpckhbw %%mm7, %%mm4                \n\t" /* 0G0G0G0G */\
                        "punpckhbw %%mm7, %%mm1                \n\t" /* 0B0B0B0B */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R */\
\
                        "psrlw $3, %%mm1                \n\t"\
                        "psllw $3, %%mm4                \n\t"\
                        "psllw $8, %%mm6                \n\t"\
\
                        "pand g16Mask, %%mm4                \n\t"\
                        "pand r16Mask, %%mm6                \n\t"\
\
                        "por %%mm4, %%mm1                \n\t"\
                        "por %%mm6, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR15 \
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm4, %%mm3                \n\t" /* G */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
\
                        "punpcklbw %%mm7, %%mm3                \n\t" /* 0G0G0G0G */\
                        "punpcklbw %%mm7, %%mm2                \n\t" /* 0B0B0B0B */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R */\
\
                        "psrlw $3, %%mm2                \n\t"\
                        "psllw $2, %%mm3                \n\t"\
                        "psllw $7, %%mm5                \n\t"\
\
                        "pand g15Mask, %%mm3                \n\t"\
                        "pand r15Mask, %%mm5                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm5, %%mm2                \n\t"\
\
                        "punpckhbw %%mm7, %%mm4                \n\t" /* 0G0G0G0G */\
                        "punpckhbw %%mm7, %%mm1                \n\t" /* 0B0B0B0B */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R */\
\
                        "psrlw $3, %%mm1                \n\t"\
                        "psllw $2, %%mm4                \n\t"\
                        "psllw $7, %%mm6                \n\t"\
\
                        "pand g15Mask, %%mm4                \n\t"\
                        "pand r15Mask, %%mm6                \n\t"\
\
                        "por %%mm4, %%mm1                \n\t"\
                        "por %%mm6, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"
// FIXME find a faster way to shuffle it to BGR24
#define WRITEBGR24 \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklbw %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhbw %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklbw %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhbw %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand bm00000111, %%mm4                \n\t" /* 00000RGB 0 */\
                        "pand bm11111000, %%mm0                \n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4                \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4                \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2                \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2                \n\t" /* 000RRGBR 1 */\
                        "pand bm00001111, %%mm2                \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4                \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand bm00000111, %%mm4                \n\t" /* 00000RGB 2 */\
                        "pand bm11111000, %%mm1                \n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1                \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4                \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2                \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4                \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5                \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand bm00000111, %%mm5                \n\t" /* 00000RGB 3 */\
                        "pand bm11111000, %%mm3                \n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3                \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3                \n\t" /* RGBRGBRG 2.5 */\
\
                        "leal (%%eax, %%eax, 2), %%ebx        \n\t"\
                        MOVNTQ(%%mm0, (%4, %%ebx))\
                        MOVNTQ(%%mm2, 8(%4, %%ebx))\
                        MOVNTQ(%%mm3, 16(%4, %%ebx))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"


/**
 * vertical scale YV12 to RGB
 */
static inline void yuv2rgbX(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
        int yalpha1=yalpha^4095;
        int uvalpha1=uvalpha^4095;
        int i;

        if(fullUVIpol)
        {

#ifdef HAVE_MMX
                if(dstbpp == 32)
                {
                        asm volatile(


FULL_YSCALEYUV2RGB
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0

                        "movq %%mm3, %%mm1                \n\t"
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0

                        MOVNTQ(%%mm3, (%4, %%eax, 4))
                        MOVNTQ(%%mm1, 8(%4, %%eax, 4))

                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"


                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        asm volatile(

FULL_YSCALEYUV2RGB

                                                                // lsb ... msb
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0

                        "movq %%mm3, %%mm1                \n\t"
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0

                        "movq %%mm3, %%mm2                \n\t" // BGR0BGR0
                        "psrlq $8, %%mm3                \n\t" // GR0BGR00
                        "pand bm00000111, %%mm2                \n\t" // BGR00000
                        "pand bm11111000, %%mm3                \n\t" // 000BGR00
                        "por %%mm2, %%mm3                \n\t" // BGRBGR00
                        "movq %%mm1, %%mm2                \n\t"
                        "psllq $48, %%mm1                \n\t" // 000000BG
                        "por %%mm1, %%mm3                \n\t" // BGRBGRBG

                        "movq %%mm2, %%mm1                \n\t" // BGR0BGR0
                        "psrld $16, %%mm2                \n\t" // R000R000
                        "psrlq $24, %%mm1                \n\t" // 0BGR0000
                        "por %%mm2, %%mm1                \n\t" // RBGRR000

                        "movl %4, %%ebx                        \n\t"
                        "addl %%eax, %%ebx                \n\t"

#ifdef HAVE_MMX2
                        //FIXME Alignment
                        "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
                        "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
#else
                        "movd %%mm3, (%%ebx, %%eax, 2)        \n\t"
                        "psrlq $32, %%mm3                \n\t"
                        "movd %%mm3, 4(%%ebx, %%eax, 2)        \n\t"
                        "movd %%mm1, 8(%%ebx, %%eax, 2)        \n\t"
#endif
                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                        "paddusb b16Dither, %%mm1        \n\t"
                        "paddusb b16Dither, %%mm0        \n\t"
                        "paddusb b16Dither, %%mm3        \n\t"
#endif
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R

                        "psrlw $3, %%mm3                \n\t"
                        "psllw $2, %%mm1                \n\t"
                        "psllw $7, %%mm0                \n\t"
                        "pand g15Mask, %%mm1                \n\t"
                        "pand r15Mask, %%mm0                \n\t"

                        "por %%mm3, %%mm1                \n\t"
                        "por %%mm1, %%mm0                \n\t"

                        MOVNTQ(%%mm0, (%4, %%eax, 2))

                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                        "paddusb g16Dither, %%mm1        \n\t"
                        "paddusb b16Dither, %%mm0        \n\t"
                        "paddusb b16Dither, %%mm3        \n\t"
#endif
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R

                        "psrlw $3, %%mm3                \n\t"
                        "psllw $3, %%mm1                \n\t"
                        "psllw $8, %%mm0                \n\t"
                        "pand g16Mask, %%mm1                \n\t"
                        "pand r16Mask, %%mm0                \n\t"

                        "por %%mm3, %%mm1                \n\t"
                        "por %%mm1, %%mm0                \n\t"

                        MOVNTQ(%%mm0, (%4, %%eax, 2))

                        "addl $4, %%eax                        \n\t"
                        "cmpl %5, %%eax                        \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
#else
                if(dstbpp==32 || dstbpp==24)
                {
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                                dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
                                dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
                                dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
                                dest+=dstbpp>>3;
                        }
                }
                else if(dstbpp==16)
                {
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[0] =
                                        (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
                                        (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
                                        (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
                                dest+=2;
                        }
                }
                else if(dstbpp==15)
                {
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[0] =
                                        (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
                                        (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
                                        (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
                                dest+=2;
                        }
                }
#endif
        }//FULL_UV_IPOL
        else
        {
#ifdef HAVE_MMX
                if(dstbpp == 32)
                {
                        asm volatile(
                                YSCALEYUV2RGB
                                WRITEBGR32

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        asm volatile(
                                YSCALEYUV2RGB
                                WRITEBGR24

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(
                                YSCALEYUV2RGB
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b16Dither, %%mm2        \n\t"
                                "paddusb b16Dither, %%mm4        \n\t"
                                "paddusb b16Dither, %%mm5        \n\t"
#endif

                                WRITEBGR15

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(
                                YSCALEYUV2RGB
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb g16Dither, %%mm2        \n\t"
                                "paddusb b16Dither, %%mm4        \n\t"
                                "paddusb b16Dither, %%mm5        \n\t"
#endif

                                WRITEBGR16

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
#else
//FIXME unroll C loop and don't recalculate UV
                if(dstbpp==32 || dstbpp==24)
                {
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                                int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
                                dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
                                dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
                                dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
                                dest+=dstbpp>>3;
                        }
                }
                else if(dstbpp==16)
                {
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                                int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[0] =
                                        (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
                                        (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
                                        (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
                                dest+=2;
                        }
                }
                else if(dstbpp==15)
                {
                        for(i=0;i<dstw;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                                int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[0] =
                                        (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
                                        (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
                                        (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
                                dest+=2;
                        }
                }
#endif
        } //!FULL_UV_IPOL
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstw, int yalpha, int uvalpha, int dstbpp)
{
        int yalpha1=yalpha^4095;
        int uvalpha1=uvalpha^4095;
        int i;
        if(fullUVIpol || allwaysIpol)
        {
                yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
                return;
        }
#ifdef HAVE_MMX
                if(dstbpp == 32)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                                WRITEBGR32
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==24)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                                WRITEBGR24
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax", "%ebx"
                        );
                }
                else if(dstbpp==15)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb b16Dither, %%mm2        \n\t"
                                "paddusb b16Dither, %%mm4        \n\t"
                                "paddusb b16Dither, %%mm5        \n\t"
#endif
                                WRITEBGR15
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstbpp==16)
                {
                        asm volatile(
                                YSCALEYUV2RGB1
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb g16Dither, %%mm2        \n\t"
                                "paddusb b16Dither, %%mm4        \n\t"
                                "paddusb b16Dither, %%mm5        \n\t"
#endif

                                WRITEBGR16
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstw),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
#else
//FIXME unroll C loop and don't recalculate UV
                if(dstbpp==32 || dstbpp==24)
                {
                        for(i=0;i<dstw;i++){
                                // yuv2rgb without vertical interpolation of Y:
                                int Y=yuvtab_2568[buf0[i]>>7];
                                int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                                int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
                                dest[0]=clip_table[((Y + yuvtab_3343[U]) >>13)];
                                dest[1]=clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)];
                                dest[2]=clip_table[((Y + yuvtab_40cf[V]) >>13)];
                                dest+=dstbpp>>3;
                        }
                }
                else if(dstbpp==16)
                {
                        for(i=0;i<dstw;i++){
                                // yuv2rgb without vertical interpolation of Y:
                                int Y=yuvtab_2568[buf0[i]>>7];
                                int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                                int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[0] =
                                        (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
                                        (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<3)&0x07E0 |
                                        (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<8)&0xF800;
                                dest+=2;
                        }
                }
                else if(dstbpp==15)
                {
                        for(i=0;i<dstw;i++){
                                // yuv2rgb without vertical interpolation of Y:
                                int Y=yuvtab_2568[buf0[i]>>7];
                                int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                                int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[0] =
                                        (clip_table[((Y + yuvtab_3343[U]) >>13)]>>3) |
                                        (clip_table[((Y + yuvtab_0c92[V] + yuvtab_1a1e[U]) >>13)]<<2)&0x03E0 |
                                        (clip_table[((Y + yuvtab_40cf[V]) >>13)]<<7)&0x7C00;
                                dest+=2;
                        }
                }
#endif
}



// *** bilinear scaling and yuv->rgb conversion of yv12 slices:
// *** Note: it's called multiple times while decoding a frame, first time y==0
// *** Designed to upscale, but may work for downscale too.
// s_xinc = (src_width << 16) / dst_width
// s_yinc = (src_height << 16) / dst_height
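// added example (not in the original): with 16.16 fixed point, scaling a
// 720 pixel wide source to a 1024 pixel wide destination gives
//        s_xinc = (720<<16)/1024 = 0xB400   (~0.70 source pixels per dst pixel)
// so the horizontal scaler advances by s_xinc per output pixel, using the
// integer part as the source sample index and the fractional part as the
// interpolation weight.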
void SwScale_YV12slice_brg24(unsigned char* srcptr[],int stride[], int y, int h,
                             unsigned char* dstptr, int dststride, int dstw, int dstbpp,
                             unsigned int s_xinc,unsigned int s_yinc){

// scaling factors:
//static int s_yinc=(vo_dga_src_height<<16)/vo_dga_vp_height;
//static int s_xinc=(vo_dga_src_width<<8)/vo_dga_vp_width;

unsigned int s_xinc2;

static int s_srcypos; // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
static int s_ypos;

// last horizontally interpolated lines, used to avoid unnecessary calculations
static int s_last_ypos;
static int s_last_y1pos;

static int static_dstw;

#ifdef HAVE_MMX2
// used to detect a horizontal size change
static int old_dstw= -1;
static int old_s_xinc= -1;
#endif

int canMMX2BeUsed=0;
int srcWidth= (dstw*s_xinc + 0x8000)>>16;
int dstUVw= fullUVIpol ? dstw : dstw/2;


#ifdef HAVE_MMX2
canMMX2BeUsed= (s_xinc <= 0x10000 && (dstw&31)==0 && (srcWidth&15)==0) ? 1 : 0;
#endif

// match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
// n-2 is the last chrominance sample available
// FIXME this is not perfect, but no one should notice the difference, the more correct variant
// would be like the vertical one, but that would require some special code for the
// first and last pixel
if(canMMX2BeUsed)         s_xinc+= 20;
else                        s_xinc = ((srcWidth-2)<<16)/(dstw-2) - 20;

if(fullUVIpol)         s_xinc2= s_xinc>>1;
else                s_xinc2= s_xinc;
  // force calculation of the horizontal interpolation of the first line
  s_last_ypos=-99;
  s_last_y1pos=-99;

  if(y==0){
      s_srcypos=-0x8000;
      s_ypos=0;
#ifdef HAVE_MMX2
// can't downscale !!!
        if((old_s_xinc != s_xinc || old_dstw!=dstw) && canMMX2BeUsed)
        {
                uint8_t *fragment;
                int imm8OfPShufW1;
                int imm8OfPShufW2;
                int fragmentLength;

                int xpos, xx, xalpha, i;

                old_s_xinc= s_xinc;
                old_dstw= dstw;

                static_dstw= dstw;

                // create an optimized horizontal scaling routine

                //code fragment

                asm volatile(
                        "jmp 9f                                \n\t"
                // Begin
                        "0:                                \n\t"
                        "movq (%%esi), %%mm0                \n\t" //FIXME Alignment
                        "movq %%mm0, %%mm1                \n\t"
                        "psrlq $8, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm1        \n\t"
                        "movq %%mm2, %%mm3                \n\t"
                        "punpcklbw %%mm7, %%mm0        \n\t"
                        "addw %%bx, %%cx                \n\t" //2*xalpha += (4*s_xinc)&0xFFFF
                        "pshufw $0xFF, %%mm1, %%mm1        \n\t"
                        "1:                                \n\t"
                        "adcl %%edx, %%esi                \n\t" //xx+= (4*s_xinc)>>16 + carry
                        "pshufw $0xFF, %%mm0, %%mm0        \n\t"
                        "2:                                \n\t"
                        "psrlw $9, %%mm3                \n\t"
                        "psubw %%mm1, %%mm0                \n\t"
                        "pmullw %%mm3, %%mm0                \n\t"
                        "paddw %%mm6, %%mm2                \n\t" // 2*alpha += xpos&0xFFFF
                        "psllw $7, %%mm1                \n\t"
                        "paddw %%mm1, %%mm0                \n\t"

                        "movq %%mm0, (%%edi, %%eax)        \n\t"

                        "addl $8, %%eax                        \n\t"
                // End
                        "9:                                \n\t"
//                "int $3\n\t"
                        "leal 0b, %0                        \n\t"
                        "leal 1b, %1                        \n\t"
                        "leal 2b, %2                        \n\t"
                        "decl %1                        \n\t"
                        "decl %2                        \n\t"
                        "subl %0, %1                        \n\t"
                        "subl %0, %2                        \n\t"
                        "leal 9b, %3                        \n\t"
                        "subl %0, %3                        \n\t"
                        :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
                         "=r" (fragmentLength)
                );

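                /* added note (not in the original): the "leal Nb" / "subl %0" sequence above
                   returns the positions of the local labels relative to label 0, i.e.
                        fragment       = address of the code template (label 0)
                        imm8OfPShufW1  = offset of the pshufw immediate byte just before label 1
                        imm8OfPShufW2  = offset of the pshufw immediate byte just before label 2
                        fragmentLength = offset of label 9 = size of the template in bytes
                   which is what the memcpy/patch loops below rely on. */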
989 0f25d72b Michael Niedermayer
                xpos= 0; //s_xinc/2 - 0x8000; // difference between pixel centers
990 783e9cc9 Michael Niedermayer
991
                /* choose xinc so that all 8 parts fit exactly
992
                   Note: we cannot use just 1 part because it would not fit in the code cache */
993 0f25d72b Michael Niedermayer
//                s_xinc2_diff= -((((s_xinc2*(dstw/8))&0xFFFF))/(dstw/8))-10;
994 783e9cc9 Michael Niedermayer
//                s_xinc_diff= -((((s_xinc*(dstw/8))&0xFFFF))/(dstw/8));
995
#ifdef ALT_ERROR
996 0f25d72b Michael Niedermayer
//                s_xinc2_diff+= ((0x10000/(dstw/8)));
997 783e9cc9 Michael Niedermayer
#endif
998 0f25d72b Michael Niedermayer
//                s_xinc_diff= s_xinc2_diff*2;
999 783e9cc9 Michael Niedermayer
1000 0f25d72b Michael Niedermayer
//                s_xinc2+= s_xinc2_diff;
1001
//                s_xinc+= s_xinc_diff;
1002 d3fda508 Michael Niedermayer
1003 0f25d72b Michael Niedermayer
//                old_s_xinc= s_xinc;
1004 d3fda508 Michael Niedermayer
1005 d3f41512 Michael Niedermayer
                for(i=0; i<dstw/8; i++)
1006
                {
1007 783e9cc9 Michael Niedermayer
                        int xx=xpos>>16;
1008 d3f41512 Michael Niedermayer
1009
                        if((i&3) == 0)
1010
                        {
1011
                                int a=0;
1012 783e9cc9 Michael Niedermayer
                                int b=((xpos+s_xinc)>>16) - xx;
1013
                                int c=((xpos+s_xinc*2)>>16) - xx;
1014
                                int d=((xpos+s_xinc*3)>>16) - xx;
1015 d3f41512 Michael Niedermayer
1016
                                memcpy(funnyYCode + fragmentLength*i/4, fragment, fragmentLength);
1017
1018
                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW1]=
1019
                                funnyYCode[fragmentLength*i/4 + imm8OfPShufW2]=
1020
                                        a | (b<<2) | (c<<4) | (d<<6);
1021
1022 d604bab9 Michael Niedermayer
                                // if we don't need to read 8 bytes then don't :), reduces the chance of
1023
                                // crossing a cache line
1024
                                if(d<3) funnyYCode[fragmentLength*i/4 + 1]= 0x6E; // 0x6e = movd: shrink the fragment's leading 8 byte load to 4 bytes
1025
1026 d3f41512 Michael Niedermayer
                                funnyYCode[fragmentLength*(i+4)/4]= RET;
1027
                        }
1028
                        xpos+=s_xinc;
1029
                }
1030
1031 0f25d72b Michael Niedermayer
                xpos= 0; //s_xinc2/2 - 0x10000; // difference between centers of chroma samples
1032 d604bab9 Michael Niedermayer
                for(i=0; i<dstUVw/8; i++)
1033 d3f41512 Michael Niedermayer
                {
1034 783e9cc9 Michael Niedermayer
                        int xx=xpos>>16;
1035 d3f41512 Michael Niedermayer
1036
                        if((i&3) == 0)
1037
                        {
1038
                                int a=0;
1039 783e9cc9 Michael Niedermayer
                                int b=((xpos+s_xinc2)>>16) - xx;
1040
                                int c=((xpos+s_xinc2*2)>>16) - xx;
1041
                                int d=((xpos+s_xinc2*3)>>16) - xx;
1042 d3f41512 Michael Niedermayer
1043
                                memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);
1044
1045
                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
1046
                                funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
1047
                                        a | (b<<2) | (c<<4) | (d<<6);
1048
1049 d604bab9 Michael Niedermayer
                                // if we don't need to read 8 bytes then don't :), reduces the chance of
1050
                                // crossing a cache line
1051
                                if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E; // 0x6e = movd: shrink the fragment's leading 8 byte load to 4 bytes
1052
1053 d3f41512 Michael Niedermayer
                                funnyUVCode[fragmentLength*(i+4)/4]= RET;
1054
                        }
1055
                        xpos+=s_xinc2;
1056
                }
1057
//                funnyCode[0]= RET;
1058
        }
1059 783e9cc9 Michael Niedermayer
1060
#endif // HAVE_MMX2
1061 31190492 Arpi
  } // reset counters
1062 d3f41512 Michael Niedermayer
1063 d3fda508 Michael Niedermayer
1064 31190492 Arpi
  while(1){
1065
    unsigned char *dest=dstptr+dststride*s_ypos;
1066 783e9cc9 Michael Niedermayer
    int y0=(s_srcypos + 0xFFFF)>>16;  // first luminance source line number below the dst line
1067
        // points to the dst pixel's center in the source (0 is the center of pixel 0,0 in src)
1068
    int srcuvpos= s_srcypos + s_yinc/2 - 0x8000;
1069
    int y1=(srcuvpos + 0x1FFFF)>>17; // first chrominance source line number below the dst line
1070 d604bab9 Michael Niedermayer
    int yalpha=((s_srcypos-1)&0xFFFF)>>4;
1071
    int uvalpha=((srcuvpos-1)&0x1FFFF)>>5;
1072 783e9cc9 Michael Niedermayer
    uint16_t *buf0=pix_buf_y[y0&1];                // top line of the interpolated slice
1073
    uint16_t *buf1=pix_buf_y[((y0+1)&1)];        // bottom line of the interpolated slice
1074
    uint16_t *uvbuf0=pix_buf_uv[y1&1];                // top line of the interpolated slice
1075
    uint16_t *uvbuf1=pix_buf_uv[(y1+1)&1];        // bottom line of the interpolated slice
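    /* added worked example of the 16.16 fixed point bookkeeping above: with
       s_srcypos=0x18000 (source position 1.5) y0=(0x18000+0xFFFF)>>16=2, so the two
       buffers hold source lines 1 and 2, and yalpha=((0x18000-1)&0xFFFF)>>4=0x7FF,
       i.e. roughly a 50/50 blend of the two lines (yalpha/uvalpha are 12 bit weights).
       Chroma is vertically subsampled 2:1, hence the 17 bit fraction used for
       srcuvpos/y1 and the >>5 that produces uvalpha. */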
1076 31190492 Arpi
    int i;
1077
1078 783e9cc9 Michael Niedermayer
    // if this is before the first line then use only the first src line
1079
    if(y0==0) buf0= buf1;
1080
    if(y1==0) uvbuf0= uvbuf1; // yes we do have to check this, it's not the same as y0==0
1081
1082
    if(y0>=y+h) break; // FIXME wrong, skips last lines, but they are duplicates anyway
1083
1084
    // if this is after the last line then use only the last src line
1085
    if(y0>=y+h)
1086
    {
1087
        buf1= buf0;
1088
        s_last_ypos=y0;
1089
    }
1090
    if(y1>=(y+h)/2)
1091
    {
1092
        uvbuf1= uvbuf0;
1093
        s_last_y1pos=y1;
1094
    }
1095
1096 31190492 Arpi
1097
    s_ypos++; s_srcypos+=s_yinc;
1098
1099 783e9cc9 Michael Niedermayer
    // only interpolate the src line horizontally if we didn't do it already
1100 31190492 Arpi
    if(s_last_ypos!=y0){
1101
      unsigned char *src=srcptr[0]+(y0-y)*stride[0];
1102
      unsigned int xpos=0;
1103
      s_last_ypos=y0;
1104
      // *** horizontal scale Y line to temp buffer
1105 783e9cc9 Michael Niedermayer
#ifdef ARCH_X86
1106 d3f41512 Michael Niedermayer
1107 783e9cc9 Michael Niedermayer
#ifdef HAVE_MMX2
1108
        if(canMMX2BeUsed)
1109
        {
1110
                asm volatile(
1111
                        "pxor %%mm7, %%mm7                \n\t"
1112
                        "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
1113
                        "movd %5, %%mm6                        \n\t" // s_xinc&0xFFFF
1114
                        "punpcklwd %%mm6, %%mm6                \n\t"
1115
                        "punpcklwd %%mm6, %%mm6                \n\t"
1116
                        "movq %%mm6, %%mm2                \n\t"
1117
                        "psllq $16, %%mm2                \n\t"
1118
                        "paddw %%mm6, %%mm2                \n\t"
1119
                        "psllq $16, %%mm2                \n\t"
1120
                        "paddw %%mm6, %%mm2                \n\t"
1121
                        "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=s_xinc&0xFF
1122
                        "movq %%mm2, temp0                \n\t"
1123
                        "movd %4, %%mm6                        \n\t" //(s_xinc*4)&0xFFFF
1124
                        "punpcklwd %%mm6, %%mm6                \n\t"
1125
                        "punpcklwd %%mm6, %%mm6                \n\t"
1126
                        "xorl %%eax, %%eax                \n\t" // i
1127
                        "movl %0, %%esi                        \n\t" // src
1128
                        "movl %1, %%edi                        \n\t" // buf1
1129
                        "movl %3, %%edx                        \n\t" // (s_xinc*4)>>16
1130
                        "xorl %%ecx, %%ecx                \n\t"
1131
                        "xorl %%ebx, %%ebx                \n\t"
1132
                        "movw %4, %%bx                        \n\t" // (s_xinc*4)&0xFFFF
1133 d604bab9 Michael Niedermayer
#ifdef HAVE_MMX2
1134
#define FUNNY_Y_CODE \
1135
                        "prefetchnta 1024(%%esi)        \n\t"\
1136
                        "prefetchnta 1056(%%esi)        \n\t"\
1137
                        "prefetchnta 1088(%%esi)        \n\t"\
1138
                        "call funnyYCode                \n\t"\
1139
                        "movq temp0, %%mm2                \n\t"\
1140 783e9cc9 Michael Niedermayer
                        "xorl %%ecx, %%ecx                \n\t"
1141 d604bab9 Michael Niedermayer
#else
1142
#define FUNNY_Y_CODE \
1143
                        "call funnyYCode                \n\t"\
1144
                        "movq temp0, %%mm2                \n\t"\
1145 783e9cc9 Michael Niedermayer
                        "xorl %%ecx, %%ecx                \n\t"
1146 d604bab9 Michael Niedermayer
#endif
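/* added note: FUNNY_Y_CODE is expanded 8 times below; each call re-enters the generated
   funnyYCode for the next stretch of output pixels (the pointer/counter registers carry
   over between calls) after reloading the xalpha accumulators from temp0 */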
1147
FUNNY_Y_CODE
1148
FUNNY_Y_CODE
1149
FUNNY_Y_CODE
1150
FUNNY_Y_CODE
1151
FUNNY_Y_CODE
1152
FUNNY_Y_CODE
1153
FUNNY_Y_CODE
1154
FUNNY_Y_CODE
1155
1156 783e9cc9 Michael Niedermayer
                        :: "m" (src), "m" (buf1), "m" (dstw), "m" ((s_xinc*4)>>16),
1157
                        "m" ((s_xinc*4)&0xFFFF), "m" (s_xinc&0xFFFF)
1158
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
1159
                );
1160 0f25d72b Michael Niedermayer
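                // added note: re-fill the rightmost output samples whose source position
                // falls at or beyond the last input pixel with that last pixel, stored
                // pre-multiplied by 128 like the other horizontally scaled samples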
                for(i=dstw-1; (i*s_xinc)>>16 >=srcWidth-1; i--) buf1[i] = src[srcWidth-1]*128;
1161 783e9cc9 Michael Niedermayer
        }
1162
        else
1163
        {
1164
#endif
1165 d604bab9 Michael Niedermayer
        //NO MMX just normal asm ...
1166 d3f41512 Michael Niedermayer
        asm volatile(
1167
                "xorl %%eax, %%eax                \n\t" // i
1168
                "xorl %%ebx, %%ebx                \n\t" // xx
1169
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
1170
                "1:                                \n\t"
1171
                "movzbl  (%0, %%ebx), %%edi        \n\t" //src[xx]
1172
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
1173
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
1174
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
1175 783e9cc9 Michael Niedermayer
                "shll $16, %%edi                \n\t"
1176 d3f41512 Michael Niedermayer
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1177 a6e972a2 Michael Niedermayer
                "movl %1, %%edi                        \n\t"
1178 783e9cc9 Michael Niedermayer
                "shrl $9, %%esi                        \n\t"
1179 a6e972a2 Michael Niedermayer
                "movw %%si, (%%edi, %%eax, 2)        \n\t"
1180 783e9cc9 Michael Niedermayer
                "addw %4, %%cx                        \n\t" //2*xalpha += s_xinc&0xFF
1181 d3f41512 Michael Niedermayer
                "adcl %3, %%ebx                        \n\t" //xx+= s_xinc>>8 + carry
1182
1183
                "movzbl (%0, %%ebx), %%edi        \n\t" //src[xx]
1184
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
1185
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
1186
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
1187 783e9cc9 Michael Niedermayer
                "shll $16, %%edi                \n\t"
1188 d3f41512 Michael Niedermayer
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1189 a6e972a2 Michael Niedermayer
                "movl %1, %%edi                        \n\t"
1190 783e9cc9 Michael Niedermayer
                "shrl $9, %%esi                        \n\t"
1191 a6e972a2 Michael Niedermayer
                "movw %%si, 2(%%edi, %%eax, 2)        \n\t"
1192 783e9cc9 Michael Niedermayer
                "addw %4, %%cx                        \n\t" //2*xalpha += s_xinc&0xFF
1193 d3f41512 Michael Niedermayer
                "adcl %3, %%ebx                        \n\t" //xx+= s_xinc>>8 + carry
1194
1195
1196
                "addl $2, %%eax                        \n\t"
1197
                "cmpl %2, %%eax                        \n\t"
1198
                " jb 1b                                \n\t"
1199
1200
1201 783e9cc9 Michael Niedermayer
                :: "r" (src), "m" (buf1), "m" (dstw), "m" (s_xinc>>16), "m" (s_xinc&0xFFFF)
1202 d3f41512 Michael Niedermayer
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
1203
                );
1204 783e9cc9 Michael Niedermayer
#ifdef HAVE_MMX2
1205
        } //if MMX2 can't be used
1206
#endif
1207 d3f41512 Michael Niedermayer
#else
1208 31190492 Arpi
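      // added note: plain C fallback - 16.16 fixed point bilinear scaling; xalpha is a
      // 7 bit fractional weight and xalpha^127 == 127-xalpha, so e.g. a fraction of
      // 0x8000 gives xalpha=64 and buf1[i]= src[xx]*63 + src[xx+1]*64, the midpoint
      // scaled by ~128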
      for(i=0;i<dstw;i++){
1209 783e9cc9 Michael Niedermayer
        register unsigned int xx=xpos>>16;
1210
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
1211 d3f41512 Michael Niedermayer
        buf1[i]=(src[xx]*(xalpha^127)+src[xx+1]*xalpha);
1212 31190492 Arpi
        xpos+=s_xinc;
1213
      }
1214 d3f41512 Michael Niedermayer
#endif
1215 783e9cc9 Michael Niedermayer
    }
1216 31190492 Arpi
      // *** horizontal scale U and V lines to temp buffer
1217 783e9cc9 Michael Niedermayer
    if(s_last_y1pos!=y1){
1218 31190492 Arpi
        unsigned char *src1=srcptr[1]+(y1-y/2)*stride[1];
1219
        unsigned char *src2=srcptr[2]+(y1-y/2)*stride[2];
1220 783e9cc9 Michael Niedermayer
        int xpos=0;
1221
        s_last_y1pos= y1;
1222
#ifdef ARCH_X86
1223 d3f41512 Michael Niedermayer
#ifdef HAVE_MMX2
1224 783e9cc9 Michael Niedermayer
        if(canMMX2BeUsed)
1225
        {
1226
                asm volatile(
1227 d3f41512 Michael Niedermayer
                "pxor %%mm7, %%mm7                \n\t"
1228
                "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
1229 783e9cc9 Michael Niedermayer
                "movd %5, %%mm6                        \n\t" // s_xinc&0xFFFF
1230 d3f41512 Michael Niedermayer
                "punpcklwd %%mm6, %%mm6                \n\t"
1231
                "punpcklwd %%mm6, %%mm6                \n\t"
1232
                "movq %%mm6, %%mm2                \n\t"
1233
                "psllq $16, %%mm2                \n\t"
1234 783e9cc9 Michael Niedermayer
                "paddw %%mm6, %%mm2                \n\t"
1235 d3f41512 Michael Niedermayer
                "psllq $16, %%mm2                \n\t"
1236 783e9cc9 Michael Niedermayer
                "paddw %%mm6, %%mm2                \n\t"
1237
                "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=s_xinc&0xFFFF
1238 d3f41512 Michael Niedermayer
                "movq %%mm2, temp0                \n\t"
1239 783e9cc9 Michael Niedermayer
                "movd %4, %%mm6                        \n\t" //(s_xinc*4)&0xFFFF
1240 d3f41512 Michael Niedermayer
                "punpcklwd %%mm6, %%mm6                \n\t"
1241
                "punpcklwd %%mm6, %%mm6                \n\t"
1242
                "xorl %%eax, %%eax                \n\t" // i
1243
                "movl %0, %%esi                        \n\t" // src
1244
                "movl %1, %%edi                        \n\t" // buf1
1245 783e9cc9 Michael Niedermayer
                "movl %3, %%edx                        \n\t" // (s_xinc*4)>>16
1246 d3f41512 Michael Niedermayer
                "xorl %%ecx, %%ecx                \n\t"
1247 783e9cc9 Michael Niedermayer
                "xorl %%ebx, %%ebx                \n\t"
1248
                "movw %4, %%bx                        \n\t" // (s_xinc*4)&0xFFFF
1249
1250 d604bab9 Michael Niedermayer
#ifdef HAVE_MMX2
1251 783e9cc9 Michael Niedermayer
#define FUNNYUVCODE \
1252 d604bab9 Michael Niedermayer
                        "prefetchnta 1024(%%esi)        \n\t"\
1253
                        "prefetchnta 1056(%%esi)        \n\t"\
1254
                        "prefetchnta 1088(%%esi)        \n\t"\
1255
                        "call funnyUVCode                \n\t"\
1256
                        "movq temp0, %%mm2                \n\t"\
1257
                        "xorl %%ecx, %%ecx                \n\t"
1258
#else
1259
#define FUNNYUVCODE \
1260
                        "call funnyUVCode                \n\t"\
1261
                        "movq temp0, %%mm2                \n\t"\
1262
                        "xorl %%ecx, %%ecx                \n\t"
1263
#endif
1264 783e9cc9 Michael Niedermayer
1265
FUNNYUVCODE
1266
FUNNYUVCODE
1267
FUNNYUVCODE
1268
FUNNYUVCODE
1269
1270
FUNNYUVCODE
1271
FUNNYUVCODE
1272
FUNNYUVCODE
1273
FUNNYUVCODE
1274
1275
1276 d3f41512 Michael Niedermayer
                "xorl %%eax, %%eax                \n\t" // i
1277
                "movl %6, %%esi                        \n\t" // src
1278
                "movl %1, %%edi                        \n\t" // buf1
1279
                "addl $4096, %%edi                \n\t"
1280
1281 783e9cc9 Michael Niedermayer
FUNNYUVCODE
1282
FUNNYUVCODE
1283
FUNNYUVCODE
1284
FUNNYUVCODE
1285
1286
FUNNYUVCODE
1287
FUNNYUVCODE
1288
FUNNYUVCODE
1289
FUNNYUVCODE
1290
1291 d604bab9 Michael Niedermayer
                :: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" ((s_xinc2*4)>>16),
1292 783e9cc9 Michael Niedermayer
                  "m" ((s_xinc2*4)&0xFFFF), "m" (s_xinc2&0xFFFF), "m" (src2)
1293 d3f41512 Michael Niedermayer
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
1294
        );
1295 d604bab9 Michael Niedermayer
                for(i=dstUVw-1; (i*s_xinc2)>>16 >=srcWidth/2-1; i--)
1296 0f25d72b Michael Niedermayer
                {
1297
                        uvbuf1[i] = src1[srcWidth/2-1]*128;
1298
                        uvbuf1[i+2048] = src2[srcWidth/2-1]*128;
1299
                }
1300 783e9cc9 Michael Niedermayer
        }
1301
        else
1302
        {
1303
#endif
1304 d3f41512 Michael Niedermayer
        asm volatile(
1305
                "xorl %%eax, %%eax                \n\t" // i
1306
                "xorl %%ebx, %%ebx                \n\t" // xx
1307
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
1308
                "1:                                \n\t"
1309 a6e972a2 Michael Niedermayer
                "movl %0, %%esi                        \n\t"
1310
                "movzbl  (%%esi, %%ebx), %%edi        \n\t" //src[xx]
1311
                "movzbl 1(%%esi, %%ebx), %%esi        \n\t" //src[xx+1]
1312 d3f41512 Michael Niedermayer
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
1313
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
1314 783e9cc9 Michael Niedermayer
                "shll $16, %%edi                \n\t"
1315 d3f41512 Michael Niedermayer
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1316
                "movl %1, %%edi                        \n\t"
1317 783e9cc9 Michael Niedermayer
                "shrl $9, %%esi                        \n\t"
1318 d3f41512 Michael Niedermayer
                "movw %%si, (%%edi, %%eax, 2)        \n\t"
1319
1320
                "movzbl  (%5, %%ebx), %%edi        \n\t" //src[xx]
1321
                "movzbl 1(%5, %%ebx), %%esi        \n\t" //src[xx+1]
1322
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
1323
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
1324 783e9cc9 Michael Niedermayer
                "shll $16, %%edi                \n\t"
1325 d3f41512 Michael Niedermayer
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1326
                "movl %1, %%edi                        \n\t"
1327 783e9cc9 Michael Niedermayer
                "shrl $9, %%esi                        \n\t"
1328 d3f41512 Michael Niedermayer
                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
1329
1330 783e9cc9 Michael Niedermayer
                "addw %4, %%cx                        \n\t" //2*xalpha += s_xinc&0xFF
1331 d3f41512 Michael Niedermayer
                "adcl %3, %%ebx                        \n\t" //xx+= s_xinc>>8 + carry
1332
                "addl $1, %%eax                        \n\t"
1333
                "cmpl %2, %%eax                        \n\t"
1334
                " jb 1b                                \n\t"
1335
1336 d604bab9 Michael Niedermayer
                :: "m" (src1), "m" (uvbuf1), "m" (dstUVw), "m" (s_xinc2>>16), "m" (s_xinc2&0xFFFF),
1337 d3f41512 Michael Niedermayer
                "r" (src2)
1338
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
1339
                );
1340 783e9cc9 Michael Niedermayer
#ifdef HAVE_MMX2
1341
        } //if MMX2 can't be used
1342
#endif
1343 d3f41512 Michael Niedermayer
#else
1344 d604bab9 Michael Niedermayer
      for(i=0;i<dstUVw;i++){
1345 783e9cc9 Michael Niedermayer
          register unsigned int xx=xpos>>16;
1346
          register unsigned int xalpha=(xpos&0xFFFF)>>9;
1347 d3f41512 Michael Niedermayer
          uvbuf1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1348
          uvbuf1[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1349 31190492 Arpi
          xpos+=s_xinc2;
1350
      }
1351 783e9cc9 Michael Niedermayer
#endif
1352 84adc106 Michael Niedermayer
        // if this is the line before the first line
1353
        if(s_srcypos == s_xinc - 0x8000)
1354
        {
1355
                s_srcypos= s_yinc/2 - 0x8000;
1356
                continue;
1357
        }
1358 31190492 Arpi
    }
1359
1360 d604bab9 Michael Niedermayer
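        // added note: |s_yinc - 0x10000| < 10 means there is (almost) no vertical scaling,
        // so the single-line version is used; otherwise the two buffered lines are blended
        // with the yalpha/uvalpha weights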
        if(ABS(s_yinc - 0x10000) < 10)
1361
                yuv2rgb1(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
1362
        else
1363
                yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
1364 783e9cc9 Michael Niedermayer
1365 d3f41512 Michael Niedermayer
#ifdef HAVE_MMX
1366 d604bab9 Michael Niedermayer
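        // added note: rotate the two dither row patterns so consecutive output lines use
        // different ordered-dither offsets (used by the dithered 15/16 bpp output code)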
        b16Dither= b16Dither1;
1367 d3f41512 Michael Niedermayer
        b16Dither1= b16Dither2;
1368
        b16Dither2= b16Dither;
1369
1370
        g16Dither= g16Dither1;
1371
        g16Dither1= g16Dither2;
1372
        g16Dither2= g16Dither;
1373 d604bab9 Michael Niedermayer
#endif
1374 31190492 Arpi
  }
1375
1376 fffd2e0a Arpi
#ifdef HAVE_3DNOW
1377
        asm volatile("femms");
1378
#elif defined (HAVE_MMX)
1379
        asm volatile("emms");
1380
#endif
1381 31190492 Arpi
}
1382
1383
1384
void SwScale_Init(){
1385
    // generating tables:
1386
    int i;
1387
    for(i=0;i<256;i++){
1388
        clip_table[i]=0;
1389
        clip_table[i+256]=i;
1390
        clip_table[i+512]=255;
1391
        yuvtab_2568[i]=(0x2568*(i-16))+(256<<13);
1392
        yuvtab_3343[i]=0x3343*(i-128);
1393
        yuvtab_0c92[i]=-0x0c92*(i-128);
1394
        yuvtab_1a1e[i]=-0x1a1e*(i-128);
1395
        yuvtab_40cf[i]=0x40cf*(i-128);
1396
    }
1397
1398
}
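
/* Added illustrative sketch (not part of the original file): how the tables built in
   SwScale_Init are meant to be combined for a single YUV -> RGB pixel.  The constants
   are BT.601-style coefficients in 3.13 fixed point (0x2568/8192 ~= 1.17, 0x3343/8192
   ~= 1.60, 0x40cf/8192 ~= 2.03, ...), and the (256<<13) bias in yuvtab_2568 lands the
   >>13 result in the middle, identity, third of clip_table so the outer thirds clamp
   to 0 and 255.  The helper itself is hypothetical and only meant as a reading aid. */
static inline void yuv2rgb_pixel_sketch(int Y, int U, int V,   // Y,U,V in 0..255
                unsigned char *r, unsigned char *g, unsigned char *b)
{
        int luma= yuvtab_2568[Y];                                      // ~1.17*(Y-16) + bias
        *r= clip_table[(luma + yuvtab_3343[V])                  >>13]; // + ~1.60*(V-128)
        *g= clip_table[(luma + yuvtab_0c92[U] + yuvtab_1a1e[V]) >>13]; // - ~0.39*(U-128) - ~0.82*(V-128)
        *b= clip_table[(luma + yuvtab_40cf[U])                  >>13]; // + ~2.03*(U-128)
}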