Statistics
| Branch: | Revision:

ffmpeg / postproc / swscale_template.c @ 51fdf2de

History | View | Annotate | Download (76 KB)

1

    
2
// Software scaling and colorspace conversion routines for MPlayer
3

    
4
// Original C implementation by A'rpi/ESP-team <arpi@thot.banki.hu>
5
// current version mostly by Michael Niedermayer (michaelni@gmx.at)
6
// the parts written by michael are under GNU GPL
7

    
8
/* Clear any previous definitions of the CPU-specific helper macros so the
 * conditional redefinitions below always start from a clean slate
 * (presumably this template is included more than once with different
 * HAVE_* settings -- TODO confirm against the including file). */
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
14

    
15
/* EMMS: instruction used to leave MMX state before FPU code runs. */
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif
21

    
22
/* Prefetch hints: 3DNow! provides prefetch/prefetchw (write intent),
 * MMX2/SSE provides prefetchnta/prefetcht0; otherwise substitute
 * "/nop", a placeholder the assembler treats as a comment. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
32

    
33
/* SFENCE: store fence needed after non-temporal (movntq) stores on MMX2;
 * becomes a "/nop" placeholder where movntq is not used. */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif
38

    
39
#ifdef HAVE_MMX2
40
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
41
#elif defined (HAVE_3DNOW)
42
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
43
#endif
44

    
45
/* MOVNTQ(a,b): quadword store. Non-temporal (cache-bypassing) movntq on
 * MMX2, ordinary movq elsewhere -- same result, different cache behavior. */
#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
50

    
51

    
52
/*
 * Multi-tap vertical scaling to planar 8-bit output.
 * Inner loop (label 1 re-entered via jnz): %%edx starts at %0 (a negative
 * filter-size counter) and counts up to zero; each tap loads a source-row
 * pointer from the table at %1 (4 bytes/entry) and a 64-bit coefficient
 * from %2 (8 bytes/entry), multiply-accumulating two quads of 16-bit
 * samples (offset by the macro argument x) into %%mm3/%%mm4.
 * After the taps: >>3, pack to 8 unsigned bytes, stream to dest %3;
 * %%eax advances 8 bytes per iteration until it reaches %4 (dstW).
 * NOTE(review): operand meanings inferred from the asm itself --
 * confirm against the caller's constraint list.
 */
#define YSCALEYUV2YV12X(x) \
                        "xorl %%eax, %%eax                \n\t"\
                        "pxor %%mm3, %%mm3                \n\t"\
                        "pxor %%mm4, %%mm4                \n\t"\
                        "movl %0, %%edx                        \n\t"\
                        ".balign 16                        \n\t" /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movl (%1, %%edx, 4), %%esi        \n\t"\
                        "movq (%2, %%edx, 8), %%mm0        \n\t" /* filterCoeff */\
                        "movq " #x "(%%esi, %%eax, 2), %%mm2        \n\t" /* srcData */\
                        "movq 8+" #x "(%%esi, %%eax, 2), %%mm5        \n\t" /* srcData */\
                        "pmulhw %%mm0, %%mm2                \n\t"\
                        "pmulhw %%mm0, %%mm5                \n\t"\
                        "paddw %%mm2, %%mm3                \n\t"\
                        "paddw %%mm5, %%mm4                \n\t"\
                        "addl $1, %%edx                        \n\t"\
                        " jnz 1b                        \n\t"\
                        "psraw $3, %%mm3                \n\t"\
                        "psraw $3, %%mm4                \n\t"\
                        "packuswb %%mm4, %%mm3                \n\t"\
                        MOVNTQ(%%mm3, (%3, %%eax))\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %4, %%eax                        \n\t"\
                        "pxor %%mm3, %%mm3                \n\t"\
                        "pxor %%mm4, %%mm4                \n\t"\
                        "movl %0, %%edx                        \n\t"\
                        "jb 1b                                \n\t"
79

    
80
/*
 * Unfiltered (single-tap) vertical pass: take 16 16-bit samples from %0,
 * shift down to 8-bit range (>>7) and pack/stream 16 output bytes to %1
 * per iteration. %%eax starts at %2 (presumably a negative byte offset so
 * the addl carries out exactly at the end -- jnc loops while no carry;
 * TODO confirm against the caller).
 */
#define YSCALEYUV2YV121 \
                        "movl %2, %%eax                        \n\t"\
                        ".balign 16                        \n\t" /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movq (%0, %%eax, 2), %%mm0        \n\t"\
                        "movq 8(%0, %%eax, 2), %%mm1        \n\t"\
                        "psraw $7, %%mm0                \n\t"\
                        "psraw $7, %%mm1                \n\t"\
                        "packuswb %%mm1, %%mm0                \n\t"\
                        MOVNTQ(%%mm0, (%1, %%eax))\
                        "addl $8, %%eax                        \n\t"\
                        "jnc 1b                                \n\t"
92

    
93
/*
94
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
95
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
96
                           "r" (dest), "m" (dstW),
97
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
98
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
99
*/
100
/*
 * Multi-tap vertical scale + YUV -> RGB conversion.
 * First inner loop (label 2): accumulate chroma taps -- U into %%mm3 and
 * V into %%mm4 (the V plane lives 4096 bytes after U in the same buffer).
 * Second inner loop: accumulate luma taps -- Y1 into %%mm1, Y2 into %%mm7.
 * Both loops count %%edx up from a negative filter size, fetching a row
 * pointer (4 bytes/entry) and a quadword coefficient (8 bytes/entry) per tap.
 * The tail applies the fixed-point color matrix (biases w80/w400, coeffs
 * yCoeff/ubCoeff/ugCoeff/vgCoeff/vrCoeff) and leaves packed bytes
 * B=%%mm2, G=%%mm4, R=%%mm5 with %%mm7 zeroed, as the WRITEBGR* macros expect.
 * Operand meanings: see the commented-out constraint list above this macro.
 */
#define YSCALEYUV2RGBX \
                "xorl %%eax, %%eax                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movl %1, %%edx                        \n\t" /* -chrFilterSize */\
                "movl %3, %%ebx                        \n\t" /* chrMmxFilter+lumFilterSize */\
                "movl %7, %%ecx                        \n\t" /* chrSrc+lumFilterSize */\
                "pxor %%mm3, %%mm3                \n\t"\
                "pxor %%mm4, %%mm4                \n\t"\
                "2:                                \n\t"\
                "movl (%%ecx, %%edx, 4), %%esi        \n\t"\
                "movq (%%ebx, %%edx, 8), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%esi, %%eax), %%mm2        \n\t" /* UsrcData */\
                "movq 4096(%%esi, %%eax), %%mm5        \n\t" /* VsrcData */\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm3                \n\t"\
                "paddw %%mm5, %%mm4                \n\t"\
                "addl $1, %%edx                        \n\t"\
                " jnz 2b                        \n\t"\
\
                "movl %0, %%edx                        \n\t" /* -lumFilterSize */\
                "movl %2, %%ebx                        \n\t" /* lumMmxFilter+lumFilterSize */\
                "movl %6, %%ecx                        \n\t" /* lumSrc+lumFilterSize */\
                "pxor %%mm1, %%mm1                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"\
                "2:                                \n\t"\
                "movl (%%ecx, %%edx, 4), %%esi        \n\t"\
                "movq (%%ebx, %%edx, 8), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%esi, %%eax, 2), %%mm2        \n\t" /* Y1srcData */\
                "movq 8(%%esi, %%eax, 2), %%mm5        \n\t" /* Y2srcData */\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm1                \n\t"\
                "paddw %%mm5, %%mm7                \n\t"\
                "addl $1, %%edx                        \n\t"\
                " jnz 2b                        \n\t"\
\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
173

    
174
/*
 * Full-chroma-resolution bilinear (two-row) YUV -> RGB: interpolates
 * between buf0/buf1 (luma, weights %6=yalpha1) and uvbuf0/uvbuf1
 * (chroma, weights %7=uvalpha1; the V plane lives 4096 bytes after U),
 * then applies the fixed-point color matrix. The movd+punpcklwd pairs
 * broadcast each 16-bit weight across all four lanes of mm6/mm5.
 * Result: B in %%mm3, R in %%mm0, G in %%mm1, each packed to bytes
 * in place (low half duplicated by packuswb reg,reg).
 */
#define FULL_YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7                \n\t"\
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "xorl %%eax, %%eax                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq (%2, %%eax,2), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax,2), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm6, %%mm0                \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw %%mm5, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "movq 4096(%2, %%eax,2), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "movq 4096(%3, %%eax,2), %%mm0        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "psubw %%mm0, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w400, %%mm3                \n\t" /* 8(U-128)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
\
\
                "pmulhw %%mm5, %%mm4                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "pmulhw ubCoeff, %%mm3                \n\t"\
                "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "pmulhw ugCoeff, %%mm2                \n\t"\
                "paddw %%mm4, %%mm0                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw w400, %%mm0                \n\t" /* (V-128)8*/\
\
\
                "movq %%mm0, %%mm4                \n\t" /* (V-128)8*/\
                "pmulhw vrCoeff, %%mm0                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
                "paddw %%mm1, %%mm3                \n\t" /* B*/\
                "paddw %%mm1, %%mm0                \n\t" /* R*/\
                "packuswb %%mm3, %%mm3                \n\t"\
\
                "packuswb %%mm0, %%mm0                \n\t"\
                "paddw %%mm4, %%mm2                \n\t"\
                "paddw %%mm2, %%mm1                \n\t" /* G*/\
\
                "packuswb %%mm1, %%mm1                \n\t"
226

    
227
/*
 * Bilinear two-row YUV -> RGB, processing two luma quads (16 pixels'
 * worth of Y) per iteration. Same interpolation math as
 * FULL_YSCALEYUV2RGB, but the broadcast weights are spilled to memory
 * (asm_yalpha1 / asm_uvalpha1) so all eight MMX registers are free for
 * pixel data. Chroma V plane is read at +4096 bytes from U.
 * Leaves packed bytes B=%%mm2, G=%%mm4, R=%%mm5 and %%mm7 zeroed,
 * ready for the WRITEBGR* store macros.
 */
#define YSCALEYUV2RGB \
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movq %%mm6, asm_yalpha1        \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "movq %%mm5, asm_uvalpha1        \n\t"\
                "xorl %%eax, %%eax                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, %%eax), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq asm_uvalpha1, %%mm0        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm6        \n\t" /*buf0[eax]*/\
                "movq 8(%1, %%eax, 2), %%mm7        \n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw asm_yalpha1, %%mm0        \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw asm_yalpha1, %%mm6        \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
299

    
300
/*
 * Single-row (no vertical interpolation) YUV -> RGB: reads only buf0 (%0)
 * and uvbuf0 (%2, V plane at +4096 bytes), shifts samples to working
 * precision (>>4) and applies the same fixed-point color matrix as the
 * bilinear variants. Output: B=%%mm2, G=%%mm4, R=%%mm5, %%mm7 zeroed.
 */
#define YSCALEYUV2RGB1 \
                "xorl %%eax, %%eax                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, %%eax), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
347

    
348
// do vertical chrominance interpolation
/*
 * Like YSCALEYUV2RGB1 but averages the two chroma rows:
 * (uvbuf0 + uvbuf1) >> 5 (see the FIXME notes -- the paddw before the
 * shift can overflow for large values). Luma still comes from buf0 only.
 * Output: B=%%mm2, G=%%mm4, R=%%mm5, %%mm7 zeroed.
 */
#define YSCALEYUV2RGB1b \
                "xorl %%eax, %%eax                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, %%eax), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
                "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
                "psubw w400, %%mm3                \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4                \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw ugCoeff, %%mm3                \n\t"\
                "pmulhw vgCoeff, %%mm4                \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw ubCoeff, %%mm2                \n\t"\
                "pmulhw vrCoeff, %%mm5                \n\t"\
                "psubw w80, %%mm1                \n\t" /* 8(Y-16)*/\
                "psubw w80, %%mm7                \n\t" /* 8(Y-16)*/\
                "pmulhw yCoeff, %%mm1                \n\t"\
                "pmulhw yCoeff, %%mm7                \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
400

    
401
/*
 * Store macro: interleave packed B (%%mm2), G (%%mm4), R (%%mm5) bytes
 * (with %%mm7 = 0 as the high/alpha byte) into four quadwords of
 * 32-bit 0RGB pixels and stream them to %4 + eax*4.
 * Advances %%eax by 8 pixels and loops back to label 1 until it
 * reaches the bound in %5.
 */
#define WRITEBGR32 \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        MOVNTQ(%%mm0, (%4, %%eax, 4))\
                        MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
                        MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
                        MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"
424

    
425
/*
 * Store macro: pack B/G/R (%%mm2/%%mm4/%%mm5, %%mm7 = 0) into 16-bit
 * 5:6:5 pixels and stream two quadwords to %4 + eax*2. The bF8/bFC
 * masks keep the top 5 (B, R) and 6 (G) bits of each component before
 * the shift-and-or merge. Loops via %%eax against the bound in %5.
 */
#define WRITEBGR16 \
                        "pand bF8, %%mm2                \n\t" /* B */\
                        "pand bFC, %%mm4                \n\t" /* G */\
                        "pand bF8, %%mm5                \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $3, %%mm3                \n\t"\
                        "psllq $3, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"
451

    
452
/*
 * Store macro: pack B/G/R (%%mm2/%%mm4/%%mm5, %%mm7 = 0) into 15-bit
 * 5:5:5 pixels (high bit clear) and stream two quadwords to %4 + eax*2.
 * Same structure as WRITEBGR16 but all components masked to 5 bits
 * (bF8) with adjusted shift counts. Loops via %%eax against %5.
 */
#define WRITEBGR15 \
                        "pand bF8, %%mm2                \n\t" /* B */\
                        "pand bF8, %%mm4                \n\t" /* G */\
                        "pand bF8, %%mm5                \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
                        "psrlq $1, %%mm5                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $2, %%mm3                \n\t"\
                        "psllq $2, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"
479

    
480
/*
 * 24bpp store macro (older variant -- the "OLD" suffix suggests it is
 * kept for reference next to WRITEBGR24MMX below). First interleaves
 * B/G/R (%%mm2/%%mm4/%%mm5, %%mm7 = 0) into four 0RGB dwords, then
 * shifts/masks them (bm* constants) into a densely packed RGBRGB...
 * byte stream: 24 output bytes per 8 pixels, written through %%ebx,
 * which advances by 24 per iteration while %%eax counts pixels
 * against the bound in %5.
 */
#define WRITEBGR24OLD \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand bm00000111, %%mm4                \n\t" /* 00000RGB 0 */\
                        "pand bm11111000, %%mm0                \n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4                \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4                \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2                \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2                \n\t" /* 000RRGBR 1 */\
                        "pand bm00001111, %%mm2                \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4                \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand bm00000111, %%mm4                \n\t" /* 00000RGB 2 */\
                        "pand bm11111000, %%mm1                \n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1                \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4                \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2                \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4                \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5                \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand bm00000111, %%mm5                \n\t" /* 00000RGB 3 */\
                        "pand bm11111000, %%mm3                \n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3                \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3                \n\t" /* RGBRGBRG 2.5 */\
\
                        MOVNTQ(%%mm0, (%%ebx))\
                        MOVNTQ(%%mm2, 8(%%ebx))\
                        MOVNTQ(%%mm3, 16(%%ebx))\
                        "addl $24, %%ebx                \n\t"\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"
535

    
536
/*
 * Store 8 pixels as 24 bytes of packed 24bpp at (%%ebx), then advance
 * %%ebx by 24 and the pixel counter %%eax by 8 and loop back to the
 * enclosing asm's "1:" label while %%eax < the width operand %5.
 * Input: B bytes in mm2, G in mm4, R in mm5, mm7 must be 0.
 * Plain-MMX variant: interleaves via punpck*, then squeezes out the pad
 * bytes with shift/or sequences.  Clobbers mm0-mm7.
 */
#define WRITEBGR24MMX \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGB0RGB 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGB0RGB 2 */\
                        "movq %%mm3, %%mm7                \n\t" /* 0RGB0RGB 3 */\
\
                        "psllq $40, %%mm0                \n\t" /* RGB00000 0 */\
                        "psllq $40, %%mm2                \n\t" /* RGB00000 1 */\
                        "psllq $40, %%mm1                \n\t" /* RGB00000 2 */\
                        "psllq $40, %%mm3                \n\t" /* RGB00000 3 */\
\
                        "punpckhdq %%mm4, %%mm0                \n\t" /* 0RGBRGB0 0 */\
                        "punpckhdq %%mm6, %%mm2                \n\t" /* 0RGBRGB0 1 */\
                        "punpckhdq %%mm5, %%mm1                \n\t" /* 0RGBRGB0 2 */\
                        "punpckhdq %%mm7, %%mm3                \n\t" /* 0RGBRGB0 3 */\
\
                        "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGBRGB0 1 */\
                        "psllq $40, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
                        MOVNTQ(%%mm0, (%%ebx))\
\
                        "psrlq $24, %%mm6                \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGBRGB0 2 */\
                        "psllq $24, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm6                \n\t" /* BRGBRGBR 1 */\
                        MOVNTQ(%%mm6, 8(%%ebx))\
\
                        "psrlq $40, %%mm5                \n\t" /* 000000RG 2 */\
                        "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm3, %%mm5                \n\t" /* RGBRGBRG 2 */\
                        MOVNTQ(%%mm5, 16(%%ebx))\
\
                        "addl $24, %%ebx                \n\t"\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"
/*
 * Same contract as WRITEBGR24MMX (B in mm2, G in mm4, R in mm5, mm7 = 0;
 * stores 24 bytes at (%%ebx), advances %%ebx/%%eax and loops on "1:"/%5),
 * but built on the MMX2-only pshufw instruction plus the M24A/M24B/M24C
 * mask constants declared elsewhere in this file.  Clobbers mm0-mm7
 * except mm2/mm5 (mm4 is shifted in place).
 */
#define WRITEBGR24MMX2 \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq M24A, %%mm0                \n\t"\
                        "movq M24C, %%mm7                \n\t"\
                        "pshufw $0x50, %%mm2, %%mm1        \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
                        "pshufw $0x50, %%mm4, %%mm3        \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
                        "pshufw $0x00, %%mm5, %%mm6        \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
                        "pand %%mm0, %%mm1                \n\t" /*    B2        B1       B0 */\
                        "pand %%mm0, %%mm3                \n\t" /*    G2        G1       G0 */\
                        "pand %%mm7, %%mm6                \n\t" /*       R1        R0       */\
\
                        "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
                        "por %%mm1, %%mm6                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, (%%ebx))\
\
                        "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
                        "pshufw $0xA5, %%mm2, %%mm1        \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
                        "pshufw $0x55, %%mm4, %%mm3        \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
                        "pshufw $0xA5, %%mm5, %%mm6        \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
                        "pand M24B, %%mm1                \n\t" /* B5       B4        B3    */\
                        "pand %%mm7, %%mm3                \n\t" /*       G4        G3       */\
                        "pand %%mm0, %%mm6                \n\t" /*    R4        R3       R2 */\
\
                        "por %%mm1, %%mm3                \n\t" /* B5    G4 B4     G3 B3    */\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 8(%%ebx))\
\
                        "pshufw $0xFF, %%mm2, %%mm1        \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
                        "pshufw $0xFA, %%mm4, %%mm3        \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
                        "pshufw $0xFA, %%mm5, %%mm6        \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
                        "pand %%mm7, %%mm1                \n\t" /*       B7        B6       */\
                        "pand %%mm0, %%mm3                \n\t" /*    G7        G6       G5 */\
                        "pand M24B, %%mm6                \n\t" /* R7       R6        R5    */\
\
                        "por %%mm1, %%mm3                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 16(%%ebx))\
\
                        "addl $24, %%ebx                \n\t"\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"
/* Select the 24bpp writer: the pshufw-based variant needs MMX2, otherwise
   fall back to the plain-MMX shuffle sequence. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif
static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
646
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
647
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
648
                                    int16_t * lumMmxFilter, int16_t * chrMmxFilter)
649
{
650
#ifdef HAVE_MMX
651
        if(uDest != NULL)
652
        {
653
                asm volatile(
654
                                YSCALEYUV2YV12X(0)
655
                                :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
656
                                "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
657
                                : "%eax", "%edx", "%esi"
658
                        );
659

    
660
                asm volatile(
661
                                YSCALEYUV2YV12X(4096)
662
                                :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
663
                                "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
664
                                : "%eax", "%edx", "%esi"
665
                        );
666
        }
667

    
668
        asm volatile(
669
                        YSCALEYUV2YV12X(0)
670
                        :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
671
                           "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
672
                        : "%eax", "%edx", "%esi"
673
                );
674
#else
675
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
676
            chrFilter, chrSrc, chrFilterSize,
677
            dest, uDest, vDest, dstW);
678
#endif
679
}
680

    
681
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
682
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
683
{
684
#ifdef HAVE_MMX
685
        if(uDest != NULL)
686
        {
687
                asm volatile(
688
                                YSCALEYUV2YV121
689
                                :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
690
                                "g" (-(dstW>>1))
691
                                : "%eax"
692
                        );
693

    
694
                asm volatile(
695
                                YSCALEYUV2YV121
696
                                :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
697
                                "g" (-(dstW>>1))
698
                                : "%eax"
699
                        );
700
        }
701

    
702
        asm volatile(
703
                YSCALEYUV2YV121
704
                :: "r" (lumSrc + dstW), "r" (dest + dstW),
705
                "g" (-dstW)
706
                : "%eax"
707
        );
708
#else
709
        //FIXME Optimize (just quickly writen not opti..)
710
        //FIXME replace MINMAX with LUTs
711
        int i;
712
        for(i=0; i<dstW; i++)
713
        {
714
                int val= lumSrc[i]>>7;
715

    
716
                dest[i]= MIN(MAX(val>>19, 0), 255);
717
        }
718

    
719
        if(uDest != NULL)
720
                for(i=0; i<(dstW>>1); i++)
721
                {
722
                        int u=chrSrc[i]>>7;
723
                        int v=chrSrc[i + 2048]>>7;
724

    
725
                        uDest[i]= MIN(MAX(u>>19, 0), 255);
726
                        vDest[i]= MIN(MAX(v>>19, 0), 255);
727
                }
728
#endif
729
}
730

    
731

    
732
/**
733
 * vertical scale YV12 to RGB
734
 */
735
static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
736
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
737
                            uint8_t *dest, int dstW, int dstbpp, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
738
{
739
        if(fullUVIpol)
740
        {
741
//FIXME
742
        }//FULL_UV_IPOL
743
        else
744
        {
745
#ifdef HAVE_MMX
746
                if(dstbpp == 32) //FIXME untested
747
                {
748
                        asm volatile(
749
                                YSCALEYUV2RGBX
750
                                WRITEBGR32
751

    
752
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
753
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
754
                           "r" (dest), "m" (dstW),
755
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
756
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
757
                        );
758
                }
759
                else if(dstbpp==24) //FIXME untested
760
                {
761
                        asm volatile(
762
                                YSCALEYUV2RGBX
763
                                "leal (%%eax, %%eax, 2), %%ebx        \n\t" //FIXME optimize
764
                                "addl %4, %%ebx                        \n\t"
765
                                WRITEBGR24
766

    
767
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
768
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
769
                           "r" (dest), "m" (dstW),
770
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
771
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
772
                        );
773
                }
774
                else if(dstbpp==15)
775
                {
776
                        asm volatile(
777
                                YSCALEYUV2RGBX
778
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
779
#ifdef DITHER1XBPP
780
                                "paddusb b5Dither, %%mm2        \n\t"
781
                                "paddusb g5Dither, %%mm4        \n\t"
782
                                "paddusb r5Dither, %%mm5        \n\t"
783
#endif
784

    
785
                                WRITEBGR15
786

    
787
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
788
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
789
                           "r" (dest), "m" (dstW),
790
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
791
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
792
                        );
793
                }
794
                else if(dstbpp==16)
795
                {
796
                        asm volatile(
797
                                YSCALEYUV2RGBX
798
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
799
#ifdef DITHER1XBPP
800
                                "paddusb b5Dither, %%mm2        \n\t"
801
                                "paddusb g6Dither, %%mm4        \n\t"
802
                                "paddusb r5Dither, %%mm5        \n\t"
803
#endif
804

    
805
                                WRITEBGR16
806

    
807
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
808
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
809
                           "r" (dest), "m" (dstW),
810
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
811
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
812
                        );
813
                }
814
#else
815
yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
816
            chrFilter, chrSrc, chrFilterSize,
817
            dest, dstW, dstbpp);
818

    
819
#endif
820
        } //!FULL_UV_IPOL
821
}
822

    
823

    
824
/**
825
 * vertical bilinear scale YV12 to RGB
826
 */
827
static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
828
                            uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstbpp)
829
{
830
        int yalpha1=yalpha^4095;
831
        int uvalpha1=uvalpha^4095;
832

    
833
        if(fullUVIpol)
834
        {
835

    
836
#ifdef HAVE_MMX
837
                if(dstbpp == 32)
838
                {
839
                        asm volatile(
840

    
841

    
842
FULL_YSCALEYUV2RGB
843
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
844
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
845

    
846
                        "movq %%mm3, %%mm1                \n\t"
847
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
848
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
849

    
850
                        MOVNTQ(%%mm3, (%4, %%eax, 4))
851
                        MOVNTQ(%%mm1, 8(%4, %%eax, 4))
852

    
853
                        "addl $4, %%eax                        \n\t"
854
                        "cmpl %5, %%eax                        \n\t"
855
                        " jb 1b                                \n\t"
856

    
857

    
858
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
859
                        "m" (yalpha1), "m" (uvalpha1)
860
                        : "%eax"
861
                        );
862
                }
863
                else if(dstbpp==24)
864
                {
865
                        asm volatile(
866

    
867
FULL_YSCALEYUV2RGB
868

    
869
                                                                // lsb ... msb
870
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
871
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
872

    
873
                        "movq %%mm3, %%mm1                \n\t"
874
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
875
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
876

    
877
                        "movq %%mm3, %%mm2                \n\t" // BGR0BGR0
878
                        "psrlq $8, %%mm3                \n\t" // GR0BGR00
879
                        "pand bm00000111, %%mm2                \n\t" // BGR00000
880
                        "pand bm11111000, %%mm3                \n\t" // 000BGR00
881
                        "por %%mm2, %%mm3                \n\t" // BGRBGR00
882
                        "movq %%mm1, %%mm2                \n\t"
883
                        "psllq $48, %%mm1                \n\t" // 000000BG
884
                        "por %%mm1, %%mm3                \n\t" // BGRBGRBG
885

    
886
                        "movq %%mm2, %%mm1                \n\t" // BGR0BGR0
887
                        "psrld $16, %%mm2                \n\t" // R000R000
888
                        "psrlq $24, %%mm1                \n\t" // 0BGR0000
889
                        "por %%mm2, %%mm1                \n\t" // RBGRR000
890

    
891
                        "movl %4, %%ebx                        \n\t"
892
                        "addl %%eax, %%ebx                \n\t"
893

    
894
#ifdef HAVE_MMX2
895
                        //FIXME Alignment
896
                        "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
897
                        "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
898
#else
899
                        "movd %%mm3, (%%ebx, %%eax, 2)        \n\t"
900
                        "psrlq $32, %%mm3                \n\t"
901
                        "movd %%mm3, 4(%%ebx, %%eax, 2)        \n\t"
902
                        "movd %%mm1, 8(%%ebx, %%eax, 2)        \n\t"
903
#endif
904
                        "addl $4, %%eax                        \n\t"
905
                        "cmpl %5, %%eax                        \n\t"
906
                        " jb 1b                                \n\t"
907

    
908
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
909
                        "m" (yalpha1), "m" (uvalpha1)
910
                        : "%eax", "%ebx"
911
                        );
912
                }
913
                else if(dstbpp==15)
914
                {
915
                        asm volatile(
916

    
917
FULL_YSCALEYUV2RGB
918
#ifdef DITHER1XBPP
919
                        "paddusb g5Dither, %%mm1        \n\t"
920
                        "paddusb r5Dither, %%mm0        \n\t"
921
                        "paddusb b5Dither, %%mm3        \n\t"
922
#endif
923
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
924
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
925
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
926

    
927
                        "psrlw $3, %%mm3                \n\t"
928
                        "psllw $2, %%mm1                \n\t"
929
                        "psllw $7, %%mm0                \n\t"
930
                        "pand g15Mask, %%mm1                \n\t"
931
                        "pand r15Mask, %%mm0                \n\t"
932

    
933
                        "por %%mm3, %%mm1                \n\t"
934
                        "por %%mm1, %%mm0                \n\t"
935

    
936
                        MOVNTQ(%%mm0, (%4, %%eax, 2))
937

    
938
                        "addl $4, %%eax                        \n\t"
939
                        "cmpl %5, %%eax                        \n\t"
940
                        " jb 1b                                \n\t"
941

    
942
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
943
                        "m" (yalpha1), "m" (uvalpha1)
944
                        : "%eax"
945
                        );
946
                }
947
                else if(dstbpp==16)
948
                {
949
                        asm volatile(
950

    
951
FULL_YSCALEYUV2RGB
952
#ifdef DITHER1XBPP
953
                        "paddusb g6Dither, %%mm1        \n\t"
954
                        "paddusb r5Dither, %%mm0        \n\t"
955
                        "paddusb b5Dither, %%mm3        \n\t"
956
#endif
957
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
958
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
959
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
960

    
961
                        "psrlw $3, %%mm3                \n\t"
962
                        "psllw $3, %%mm1                \n\t"
963
                        "psllw $8, %%mm0                \n\t"
964
                        "pand g16Mask, %%mm1                \n\t"
965
                        "pand r16Mask, %%mm0                \n\t"
966

    
967
                        "por %%mm3, %%mm1                \n\t"
968
                        "por %%mm1, %%mm0                \n\t"
969

    
970
                        MOVNTQ(%%mm0, (%4, %%eax, 2))
971

    
972
                        "addl $4, %%eax                        \n\t"
973
                        "cmpl %5, %%eax                        \n\t"
974
                        " jb 1b                                \n\t"
975

    
976
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
977
                        "m" (yalpha1), "m" (uvalpha1)
978
                        : "%eax"
979
                        );
980
                }
981
#else
982
                if(dstbpp==32 || dstbpp==24)
983
                {
984
                        int i;
985
                        for(i=0;i<dstW;i++){
986
                                // vertical linear interpolation && yuv2rgb in a single step:
987
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
988
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
989
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
990
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
991
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
992
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
993
                                dest+=dstbpp>>3;
994
                        }
995
                }
996
                else if(dstbpp==16)
997
                {
998
                        int i;
999
                        for(i=0;i<dstW;i++){
1000
                                // vertical linear interpolation && yuv2rgb in a single step:
1001
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1002
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1003
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1004

    
1005
                                ((uint16_t*)dest)[i] =
1006
                                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1007
                                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1008
                                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1009
                        }
1010
                }
1011
                else if(dstbpp==15)
1012
                {
1013
                        int i;
1014
                        for(i=0;i<dstW;i++){
1015
                                // vertical linear interpolation && yuv2rgb in a single step:
1016
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1017
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1018
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1019

    
1020
                                ((uint16_t*)dest)[i] =
1021
                                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1022
                                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1023
                                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1024
                        }
1025
                }
1026
#endif
1027
        }//FULL_UV_IPOL
1028
        else
1029
        {
1030
#ifdef HAVE_MMX
1031
                if(dstbpp == 32)
1032
                {
1033
                        asm volatile(
1034
                                YSCALEYUV2RGB
1035
                                WRITEBGR32
1036

    
1037
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1038
                        "m" (yalpha1), "m" (uvalpha1)
1039
                        : "%eax"
1040
                        );
1041
                }
1042
                else if(dstbpp==24)
1043
                {
1044
                        asm volatile(
1045
                                "movl %4, %%ebx                        \n\t"
1046
                                YSCALEYUV2RGB
1047
                                WRITEBGR24
1048

    
1049
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1050
                        "m" (yalpha1), "m" (uvalpha1)
1051
                        : "%eax", "%ebx"
1052
                        );
1053
                }
1054
                else if(dstbpp==15)
1055
                {
1056
                        asm volatile(
1057
                                YSCALEYUV2RGB
1058
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1059
#ifdef DITHER1XBPP
1060
                                "paddusb b5Dither, %%mm2        \n\t"
1061
                                "paddusb g5Dither, %%mm4        \n\t"
1062
                                "paddusb r5Dither, %%mm5        \n\t"
1063
#endif
1064

    
1065
                                WRITEBGR15
1066

    
1067
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1068
                        "m" (yalpha1), "m" (uvalpha1)
1069
                        : "%eax"
1070
                        );
1071
                }
1072
                else if(dstbpp==16)
1073
                {
1074
                        asm volatile(
1075
                                YSCALEYUV2RGB
1076
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1077
#ifdef DITHER1XBPP
1078
                                "paddusb b5Dither, %%mm2        \n\t"
1079
                                "paddusb g6Dither, %%mm4        \n\t"
1080
                                "paddusb r5Dither, %%mm5        \n\t"
1081
#endif
1082

    
1083
                                WRITEBGR16
1084

    
1085
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1086
                        "m" (yalpha1), "m" (uvalpha1)
1087
                        : "%eax"
1088
                        );
1089
                }
1090
#else
1091
                if(dstbpp==32)
1092
                {
1093
                        int i;
1094
                        for(i=0; i<dstW-1; i+=2){
1095
                                // vertical linear interpolation && yuv2rgb in a single step:
1096
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1097
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1098
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1099
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1100

    
1101
                                int Cb= yuvtab_40cf[U];
1102
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1103
                                int Cr= yuvtab_3343[V];
1104

    
1105
                                dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1106
                                dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1107
                                dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1108

    
1109
                                dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1110
                                dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1111
                                dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1112
                        }
1113
                }
1114
                else if(dstbpp==24)
1115
                {
1116
                        int i;
1117
                        for(i=0; i<dstW-1; i+=2){
1118
                                // vertical linear interpolation && yuv2rgb in a single step:
1119
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1120
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1121
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1122
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1123

    
1124
                                int Cb= yuvtab_40cf[U];
1125
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1126
                                int Cr= yuvtab_3343[V];
1127

    
1128
                                dest[0]=clip_table[((Y1 + Cb) >>13)];
1129
                                dest[1]=clip_table[((Y1 + Cg) >>13)];
1130
                                dest[2]=clip_table[((Y1 + Cr) >>13)];
1131

    
1132
                                dest[3]=clip_table[((Y2 + Cb) >>13)];
1133
                                dest[4]=clip_table[((Y2 + Cg) >>13)];
1134
                                dest[5]=clip_table[((Y2 + Cr) >>13)];
1135
                                dest+=6;
1136
                        }
1137
                }
1138
                else if(dstbpp==16)
1139
                {
1140
                        int i;
1141
                        for(i=0; i<dstW-1; i+=2){
1142
                                // vertical linear interpolation && yuv2rgb in a single step:
1143
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1144
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1145
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1146
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1147

    
1148
                                int Cb= yuvtab_40cf[U];
1149
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1150
                                int Cr= yuvtab_3343[V];
1151

    
1152
                                ((uint16_t*)dest)[i] =
1153
                                        clip_table16b[(Y1 + Cb) >>13] |
1154
                                        clip_table16g[(Y1 + Cg) >>13] |
1155
                                        clip_table16r[(Y1 + Cr) >>13];
1156

    
1157
                                ((uint16_t*)dest)[i+1] =
1158
                                        clip_table16b[(Y2 + Cb) >>13] |
1159
                                        clip_table16g[(Y2 + Cg) >>13] |
1160
                                        clip_table16r[(Y2 + Cr) >>13];
1161
                        }
1162
                }
1163
                else if(dstbpp==15)
1164
                {
1165
                        int i;
1166
                        for(i=0; i<dstW-1; i+=2){
1167
                                // vertical linear interpolation && yuv2rgb in a single step:
1168
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1169
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1170
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1171
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1172

    
1173
                                int Cb= yuvtab_40cf[U];
1174
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1175
                                int Cr= yuvtab_3343[V];
1176

    
1177
                                ((uint16_t*)dest)[i] =
1178
                                        clip_table15b[(Y1 + Cb) >>13] |
1179
                                        clip_table15g[(Y1 + Cg) >>13] |
1180
                                        clip_table15r[(Y1 + Cr) >>13];
1181

    
1182
                                ((uint16_t*)dest)[i+1] =
1183
                                        clip_table15b[(Y2 + Cb) >>13] |
1184
                                        clip_table15g[(Y2 + Cg) >>13] |
1185
                                        clip_table15r[(Y2 + Cr) >>13];
1186
                        }
1187
                }
1188
#endif
1189
        } //!FULL_UV_IPOL
1190
}
1191

    
1192
/**
1193
 * YV12 to RGB without scaling or interpolating
1194
 */
1195
static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1196
                            uint8_t *dest, int dstW, int uvalpha, int dstbpp)
1197
{
1198
        int uvalpha1=uvalpha^4095;
1199
        const int yalpha1=0;
1200

    
1201
        if(fullUVIpol || allwaysIpol)
1202
        {
1203
                RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstbpp);
1204
                return;
1205
        }
1206

    
1207
#ifdef HAVE_MMX
1208
        if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1209
        {
1210
                if(dstbpp == 32)
1211
                {
1212
                        asm volatile(
1213
                                YSCALEYUV2RGB1
1214
                                WRITEBGR32
1215
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1216
                        "m" (yalpha1), "m" (uvalpha1)
1217
                        : "%eax"
1218
                        );
1219
                }
1220
                else if(dstbpp==24)
1221
                {
1222
                        asm volatile(
1223
                                "movl %4, %%ebx                        \n\t"
1224
                                YSCALEYUV2RGB1
1225
                                WRITEBGR24
1226
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1227
                        "m" (yalpha1), "m" (uvalpha1)
1228
                        : "%eax", "%ebx"
1229
                        );
1230
                }
1231
                else if(dstbpp==15)
1232
                {
1233
                        asm volatile(
1234
                                YSCALEYUV2RGB1
1235
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1236
#ifdef DITHER1XBPP
1237
                                "paddusb b5Dither, %%mm2        \n\t"
1238
                                "paddusb g5Dither, %%mm4        \n\t"
1239
                                "paddusb r5Dither, %%mm5        \n\t"
1240
#endif
1241
                                WRITEBGR15
1242
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1243
                        "m" (yalpha1), "m" (uvalpha1)
1244
                        : "%eax"
1245
                        );
1246
                }
1247
                else if(dstbpp==16)
1248
                {
1249
                        asm volatile(
1250
                                YSCALEYUV2RGB1
1251
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1252
#ifdef DITHER1XBPP
1253
                                "paddusb b5Dither, %%mm2        \n\t"
1254
                                "paddusb g6Dither, %%mm4        \n\t"
1255
                                "paddusb r5Dither, %%mm5        \n\t"
1256
#endif
1257

    
1258
                                WRITEBGR16
1259
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1260
                        "m" (yalpha1), "m" (uvalpha1)
1261
                        : "%eax"
1262
                        );
1263
                }
1264
        }
1265
        else
1266
        {
1267
                if(dstbpp == 32)
1268
                {
1269
                        asm volatile(
1270
                                YSCALEYUV2RGB1b
1271
                                WRITEBGR32
1272
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1273
                        "m" (yalpha1), "m" (uvalpha1)
1274
                        : "%eax"
1275
                        );
1276
                }
1277
                else if(dstbpp==24)
1278
                {
1279
                        asm volatile(
1280
                                "movl %4, %%ebx                        \n\t"
1281
                                YSCALEYUV2RGB1b
1282
                                WRITEBGR24
1283
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1284
                        "m" (yalpha1), "m" (uvalpha1)
1285
                        : "%eax", "%ebx"
1286
                        );
1287
                }
1288
                else if(dstbpp==15)
1289
                {
1290
                        asm volatile(
1291
                                YSCALEYUV2RGB1b
1292
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1293
#ifdef DITHER1XBPP
1294
                                "paddusb b5Dither, %%mm2        \n\t"
1295
                                "paddusb g5Dither, %%mm4        \n\t"
1296
                                "paddusb r5Dither, %%mm5        \n\t"
1297
#endif
1298
                                WRITEBGR15
1299
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1300
                        "m" (yalpha1), "m" (uvalpha1)
1301
                        : "%eax"
1302
                        );
1303
                }
1304
                else if(dstbpp==16)
1305
                {
1306
                        asm volatile(
1307
                                YSCALEYUV2RGB1b
1308
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1309
#ifdef DITHER1XBPP
1310
                                "paddusb b5Dither, %%mm2        \n\t"
1311
                                "paddusb g6Dither, %%mm4        \n\t"
1312
                                "paddusb r5Dither, %%mm5        \n\t"
1313
#endif
1314

    
1315
                                WRITEBGR16
1316
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1317
                        "m" (yalpha1), "m" (uvalpha1)
1318
                        : "%eax"
1319
                        );
1320
                }
1321
        }
1322
#else
1323
//FIXME write 2 versions (for even & odd lines)
1324

    
1325
        if(dstbpp==32)
1326
        {
1327
                int i;
1328
                for(i=0; i<dstW-1; i+=2){
1329
                        // vertical linear interpolation && yuv2rgb in a single step:
1330
                        int Y1=yuvtab_2568[buf0[i]>>7];
1331
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1332
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1333
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1334

    
1335
                        int Cb= yuvtab_40cf[U];
1336
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1337
                        int Cr= yuvtab_3343[V];
1338

    
1339
                        dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1340
                        dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1341
                        dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1342

    
1343
                        dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1344
                        dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1345
                        dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1346
                }
1347
        }
1348
        else if(dstbpp==24)
1349
        {
1350
                int i;
1351
                for(i=0; i<dstW-1; i+=2){
1352
                        // vertical linear interpolation && yuv2rgb in a single step:
1353
                        int Y1=yuvtab_2568[buf0[i]>>7];
1354
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1355
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1356
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1357

    
1358
                        int Cb= yuvtab_40cf[U];
1359
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1360
                        int Cr= yuvtab_3343[V];
1361

    
1362
                        dest[0]=clip_table[((Y1 + Cb) >>13)];
1363
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
1364
                        dest[2]=clip_table[((Y1 + Cr) >>13)];
1365

    
1366
                        dest[3]=clip_table[((Y2 + Cb) >>13)];
1367
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
1368
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
1369
                        dest+=6;
1370
                }
1371
        }
1372
        else if(dstbpp==16)
1373
        {
1374
                int i;
1375
                for(i=0; i<dstW-1; i+=2){
1376
                        // vertical linear interpolation && yuv2rgb in a single step:
1377
                        int Y1=yuvtab_2568[buf0[i]>>7];
1378
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1379
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1380
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1381

    
1382
                        int Cb= yuvtab_40cf[U];
1383
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1384
                        int Cr= yuvtab_3343[V];
1385

    
1386
                        ((uint16_t*)dest)[i] =
1387
                                clip_table16b[(Y1 + Cb) >>13] |
1388
                                clip_table16g[(Y1 + Cg) >>13] |
1389
                                clip_table16r[(Y1 + Cr) >>13];
1390

    
1391
                        ((uint16_t*)dest)[i+1] =
1392
                                clip_table16b[(Y2 + Cb) >>13] |
1393
                                clip_table16g[(Y2 + Cg) >>13] |
1394
                                clip_table16r[(Y2 + Cr) >>13];
1395
                }
1396
        }
1397
        else if(dstbpp==15)
1398
        {
1399
                int i;
1400
                for(i=0; i<dstW-1; i+=2){
1401
                        // vertical linear interpolation && yuv2rgb in a single step:
1402
                        int Y1=yuvtab_2568[buf0[i]>>7];
1403
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1404
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1405
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1406

    
1407
                        int Cb= yuvtab_40cf[U];
1408
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1409
                        int Cr= yuvtab_3343[V];
1410

    
1411
                        ((uint16_t*)dest)[i] =
1412
                                clip_table15b[(Y1 + Cb) >>13] |
1413
                                clip_table15g[(Y1 + Cg) >>13] |
1414
                                clip_table15r[(Y1 + Cr) >>13];
1415

    
1416
                        ((uint16_t*)dest)[i+1] =
1417
                                clip_table15b[(Y2 + Cb) >>13] |
1418
                                clip_table15g[(Y2 + Cg) >>13] |
1419
                                clip_table15r[(Y2 + Cr) >>13];
1420
                }
1421
        }
1422
#endif
1423
}
1424

    
1425
// Bilinear / Bicubic scaling
1426
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1427
                                  int16_t *filter, int16_t *filterPos, int filterSize)
1428
{
1429
#ifdef HAVE_MMX
1430
        if(filterSize==4) // allways true for upscaling, sometimes for down too
1431
        {
1432
                int counter= -2*dstW;
1433
                filter-= counter*2;
1434
                filterPos-= counter/2;
1435
                dst-= counter/2;
1436
                asm volatile(
1437
                        "pxor %%mm7, %%mm7                \n\t"
1438
                        "movq w02, %%mm6                \n\t"
1439
                        "pushl %%ebp                        \n\t" // we use 7 regs here ...
1440
                        "movl %%eax, %%ebp                \n\t"
1441
                        ".balign 16                        \n\t"
1442
                        "1:                                \n\t"
1443
                        "movzwl (%2, %%ebp), %%eax        \n\t"
1444
                        "movzwl 2(%2, %%ebp), %%ebx        \n\t"
1445
                        "movq (%1, %%ebp, 4), %%mm1        \n\t"
1446
                        "movq 8(%1, %%ebp, 4), %%mm3        \n\t"
1447
                        "movd (%3, %%eax), %%mm0        \n\t"
1448
                        "movd (%3, %%ebx), %%mm2        \n\t"
1449
                        "punpcklbw %%mm7, %%mm0                \n\t"
1450
                        "punpcklbw %%mm7, %%mm2                \n\t"
1451
                        "pmaddwd %%mm1, %%mm0                \n\t"
1452
                        "pmaddwd %%mm2, %%mm3                \n\t"
1453
                        "psrad $8, %%mm0                \n\t"
1454
                        "psrad $8, %%mm3                \n\t"
1455
                        "packssdw %%mm3, %%mm0                \n\t"
1456
                        "pmaddwd %%mm6, %%mm0                \n\t"
1457
                        "packssdw %%mm0, %%mm0                \n\t"
1458
                        "movd %%mm0, (%4, %%ebp)        \n\t"
1459
                        "addl $4, %%ebp                        \n\t"
1460
                        " jnc 1b                        \n\t"
1461

    
1462
                        "popl %%ebp                        \n\t"
1463
                        : "+a" (counter)
1464
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1465
                        : "%ebx"
1466
                );
1467
        }
1468
        else if(filterSize==8)
1469
        {
1470
                int counter= -2*dstW;
1471
                filter-= counter*4;
1472
                filterPos-= counter/2;
1473
                dst-= counter/2;
1474
                asm volatile(
1475
                        "pxor %%mm7, %%mm7                \n\t"
1476
                        "movq w02, %%mm6                \n\t"
1477
                        "pushl %%ebp                        \n\t" // we use 7 regs here ...
1478
                        "movl %%eax, %%ebp                \n\t"
1479
                        ".balign 16                        \n\t"
1480
                        "1:                                \n\t"
1481
                        "movzwl (%2, %%ebp), %%eax        \n\t"
1482
                        "movzwl 2(%2, %%ebp), %%ebx        \n\t"
1483
                        "movq (%1, %%ebp, 8), %%mm1        \n\t"
1484
                        "movq 16(%1, %%ebp, 8), %%mm3        \n\t"
1485
                        "movd (%3, %%eax), %%mm0        \n\t"
1486
                        "movd (%3, %%ebx), %%mm2        \n\t"
1487
                        "punpcklbw %%mm7, %%mm0                \n\t"
1488
                        "punpcklbw %%mm7, %%mm2                \n\t"
1489
                        "pmaddwd %%mm1, %%mm0                \n\t"
1490
                        "pmaddwd %%mm2, %%mm3                \n\t"
1491

    
1492
                        "movq 8(%1, %%ebp, 8), %%mm1        \n\t"
1493
                        "movq 24(%1, %%ebp, 8), %%mm5        \n\t"
1494
                        "movd 4(%3, %%eax), %%mm4        \n\t"
1495
                        "movd 4(%3, %%ebx), %%mm2        \n\t"
1496
                        "punpcklbw %%mm7, %%mm4                \n\t"
1497
                        "punpcklbw %%mm7, %%mm2                \n\t"
1498
                        "pmaddwd %%mm1, %%mm4                \n\t"
1499
                        "pmaddwd %%mm2, %%mm5                \n\t"
1500
                        "paddd %%mm4, %%mm0                \n\t"
1501
                        "paddd %%mm5, %%mm3                \n\t"
1502
                                                
1503
                        "psrad $8, %%mm0                \n\t"
1504
                        "psrad $8, %%mm3                \n\t"
1505
                        "packssdw %%mm3, %%mm0                \n\t"
1506
                        "pmaddwd %%mm6, %%mm0                \n\t"
1507
                        "packssdw %%mm0, %%mm0                \n\t"
1508
                        "movd %%mm0, (%4, %%ebp)        \n\t"
1509
                        "addl $4, %%ebp                        \n\t"
1510
                        " jnc 1b                        \n\t"
1511

    
1512
                        "popl %%ebp                        \n\t"
1513
                        : "+a" (counter)
1514
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1515
                        : "%ebx"
1516
                );
1517
        }
1518
        else
1519
        {
1520
                int counter= -2*dstW;
1521
//                filter-= counter*filterSize/2;
1522
                filterPos-= counter/2;
1523
                dst-= counter/2;
1524
                asm volatile(
1525
                        "pxor %%mm7, %%mm7                \n\t"
1526
                        "movq w02, %%mm6                \n\t"
1527
                        ".balign 16                        \n\t"
1528
                        "1:                                \n\t"
1529
                        "movl %2, %%ecx                        \n\t"
1530
                        "movzwl (%%ecx, %0), %%eax        \n\t"
1531
                        "movzwl 2(%%ecx, %0), %%ebx        \n\t"
1532
                        "movl %5, %%ecx                        \n\t"
1533
                        "pxor %%mm4, %%mm4                \n\t"
1534
                        "pxor %%mm5, %%mm5                \n\t"
1535
                        "2:                                \n\t"
1536
                        "movq (%1), %%mm1                \n\t"
1537
                        "movq (%1, %6), %%mm3                \n\t"
1538
                        "movd (%%ecx, %%eax), %%mm0        \n\t"
1539
                        "movd (%%ecx, %%ebx), %%mm2        \n\t"
1540
                        "punpcklbw %%mm7, %%mm0                \n\t"
1541
                        "punpcklbw %%mm7, %%mm2                \n\t"
1542
                        "pmaddwd %%mm1, %%mm0                \n\t"
1543
                        "pmaddwd %%mm2, %%mm3                \n\t"
1544
                        "paddd %%mm3, %%mm5                \n\t"
1545
                        "paddd %%mm0, %%mm4                \n\t"
1546
                        "addl $8, %1                        \n\t"
1547
                        "addl $4, %%ecx                        \n\t"
1548
                        "cmpl %4, %%ecx                        \n\t"
1549
                        " jb 2b                                \n\t"
1550
                        "addl %6, %1                        \n\t"
1551
                        "psrad $8, %%mm4                \n\t"
1552
                        "psrad $8, %%mm5                \n\t"
1553
                        "packssdw %%mm5, %%mm4                \n\t"
1554
                        "pmaddwd %%mm6, %%mm4                \n\t"
1555
                        "packssdw %%mm4, %%mm4                \n\t"
1556
                        "movl %3, %%eax                        \n\t"
1557
                        "movd %%mm4, (%%eax, %0)        \n\t"
1558
                        "addl $4, %0                        \n\t"
1559
                        " jnc 1b                        \n\t"
1560

    
1561
                        : "+r" (counter), "+r" (filter)
1562
                        : "m" (filterPos), "m" (dst), "m"(src+filterSize),
1563
                          "m" (src), "r" (filterSize*2)
1564
                        : "%ebx", "%eax", "%ecx"
1565
                );
1566
        }
1567
#else
1568
        int i;
1569
        for(i=0; i<dstW; i++)
1570
        {
1571
                int j;
1572
                int srcPos= filterPos[i];
1573
                int val=0;
1574
//                printf("filterPos: %d\n", filterPos[i]);
1575
                for(j=0; j<filterSize; j++)
1576
                {
1577
//                        printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
1578
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1579
                }
1580
//                filter += hFilterSize;
1581
                dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
1582
//                dst[i] = val>>7;
1583
        }
1584
#endif
1585
}
1586
      // *** horizontal scale Y line to temp buffer
1587
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc)
1588
{
1589
#ifdef HAVE_MMX
1590
        // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
1591
    if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
1592
#else
1593
    if(sws_flags != SWS_FAST_BILINEAR)
1594
#endif
1595
    {
1596
            RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
1597
    }
1598
    else // Fast Bilinear upscale / crap downscale
1599
    {
1600
#ifdef ARCH_X86
1601
#ifdef HAVE_MMX2
1602
        int i;
1603
        if(canMMX2BeUsed)
1604
        {
1605
                asm volatile(
1606
                        "pxor %%mm7, %%mm7                \n\t"
1607
                        "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
1608
                        "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
1609
                        "punpcklwd %%mm6, %%mm6                \n\t"
1610
                        "punpcklwd %%mm6, %%mm6                \n\t"
1611
                        "movq %%mm6, %%mm2                \n\t"
1612
                        "psllq $16, %%mm2                \n\t"
1613
                        "paddw %%mm6, %%mm2                \n\t"
1614
                        "psllq $16, %%mm2                \n\t"
1615
                        "paddw %%mm6, %%mm2                \n\t"
1616
                        "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFF
1617
                        "movq %%mm2, temp0                \n\t"
1618
                        "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
1619
                        "punpcklwd %%mm6, %%mm6                \n\t"
1620
                        "punpcklwd %%mm6, %%mm6                \n\t"
1621
                        "xorl %%eax, %%eax                \n\t" // i
1622
                        "movl %0, %%esi                        \n\t" // src
1623
                        "movl %1, %%edi                        \n\t" // buf1
1624
                        "movl %3, %%edx                        \n\t" // (xInc*4)>>16
1625
                        "xorl %%ecx, %%ecx                \n\t"
1626
                        "xorl %%ebx, %%ebx                \n\t"
1627
                        "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF
1628

    
1629
#define FUNNY_Y_CODE \
1630
                        PREFETCH" 1024(%%esi)                \n\t"\
1631
                        PREFETCH" 1056(%%esi)                \n\t"\
1632
                        PREFETCH" 1088(%%esi)                \n\t"\
1633
                        "call funnyYCode                \n\t"\
1634
                        "movq temp0, %%mm2                \n\t"\
1635
                        "xorl %%ecx, %%ecx                \n\t"
1636

    
1637
FUNNY_Y_CODE
1638
FUNNY_Y_CODE
1639
FUNNY_Y_CODE
1640
FUNNY_Y_CODE
1641
FUNNY_Y_CODE
1642
FUNNY_Y_CODE
1643
FUNNY_Y_CODE
1644
FUNNY_Y_CODE
1645

    
1646
                        :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
1647
                        "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF)
1648
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
1649
                );
1650
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
1651
        }
1652
        else
1653
        {
1654
#endif
1655
        //NO MMX just normal asm ...
1656
        asm volatile(
1657
                "xorl %%eax, %%eax                \n\t" // i
1658
                "xorl %%ebx, %%ebx                \n\t" // xx
1659
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
1660
                ".balign 16                        \n\t"
1661
                "1:                                \n\t"
1662
                "movzbl  (%0, %%ebx), %%edi        \n\t" //src[xx]
1663
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
1664
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
1665
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
1666
                "shll $16, %%edi                \n\t"
1667
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1668
                "movl %1, %%edi                        \n\t"
1669
                "shrl $9, %%esi                        \n\t"
1670
                "movw %%si, (%%edi, %%eax, 2)        \n\t"
1671
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
1672
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>8 + carry
1673

    
1674
                "movzbl (%0, %%ebx), %%edi        \n\t" //src[xx]
1675
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
1676
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
1677
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
1678
                "shll $16, %%edi                \n\t"
1679
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1680
                "movl %1, %%edi                        \n\t"
1681
                "shrl $9, %%esi                        \n\t"
1682
                "movw %%si, 2(%%edi, %%eax, 2)        \n\t"
1683
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
1684
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>8 + carry
1685

    
1686

    
1687
                "addl $2, %%eax                        \n\t"
1688
                "cmpl %2, %%eax                        \n\t"
1689
                " jb 1b                                \n\t"
1690

    
1691

    
1692
                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
1693
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
1694
                );
1695
#ifdef HAVE_MMX2
1696
        } //if MMX2 cant be used
1697
#endif
1698
#else
1699
        int i;
1700
        unsigned int xpos=0;
1701
        for(i=0;i<dstWidth;i++)
1702
        {
1703
                register unsigned int xx=xpos>>16;
1704
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
1705
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1706
                xpos+=xInc;
1707
        }
1708
#endif
1709
    }
1710
}
1711

    
1712
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth,
1713
                                uint8_t *src1, uint8_t *src2, int srcW, int xInc)
1714
{
1715
#ifdef HAVE_MMX
1716
        // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
1717
    if(sws_flags != SWS_FAST_BILINEAR || (!canMMX2BeUsed))
1718
#else
1719
    if(sws_flags != SWS_FAST_BILINEAR)
1720
#endif
1721
    {
1722
            RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1723
            RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
1724
    }
1725
    else // Fast Bilinear upscale / crap downscale
1726
    {
1727
#ifdef ARCH_X86
1728
#ifdef HAVE_MMX2
1729
        int i;
1730
        if(canMMX2BeUsed)
1731
        {
1732
                asm volatile(
1733
                "pxor %%mm7, %%mm7                \n\t"
1734
                "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
1735
                "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
1736
                "punpcklwd %%mm6, %%mm6                \n\t"
1737
                "punpcklwd %%mm6, %%mm6                \n\t"
1738
                "movq %%mm6, %%mm2                \n\t"
1739
                "psllq $16, %%mm2                \n\t"
1740
                "paddw %%mm6, %%mm2                \n\t"
1741
                "psllq $16, %%mm2                \n\t"
1742
                "paddw %%mm6, %%mm2                \n\t"
1743
                "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFFFF
1744
                "movq %%mm2, temp0                \n\t"
1745
                "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
1746
                "punpcklwd %%mm6, %%mm6                \n\t"
1747
                "punpcklwd %%mm6, %%mm6                \n\t"
1748
                "xorl %%eax, %%eax                \n\t" // i
1749
                "movl %0, %%esi                        \n\t" // src
1750
                "movl %1, %%edi                        \n\t" // buf1
1751
                "movl %3, %%edx                        \n\t" // (xInc*4)>>16
1752
                "xorl %%ecx, %%ecx                \n\t"
1753
                "xorl %%ebx, %%ebx                \n\t"
1754
                "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF
1755

    
1756
#define FUNNYUVCODE \
1757
                        PREFETCH" 1024(%%esi)                \n\t"\
1758
                        PREFETCH" 1056(%%esi)                \n\t"\
1759
                        PREFETCH" 1088(%%esi)                \n\t"\
1760
                        "call funnyUVCode                \n\t"\
1761
                        "movq temp0, %%mm2                \n\t"\
1762
                        "xorl %%ecx, %%ecx                \n\t"
1763

    
1764
FUNNYUVCODE
1765
FUNNYUVCODE
1766
FUNNYUVCODE
1767
FUNNYUVCODE
1768

    
1769
FUNNYUVCODE
1770
FUNNYUVCODE
1771
FUNNYUVCODE
1772
FUNNYUVCODE
1773
                "xorl %%eax, %%eax                \n\t" // i
1774
                "movl %6, %%esi                        \n\t" // src
1775
                "movl %1, %%edi                        \n\t" // buf1
1776
                "addl $4096, %%edi                \n\t"
1777

    
1778
FUNNYUVCODE
1779
FUNNYUVCODE
1780
FUNNYUVCODE
1781
FUNNYUVCODE
1782

    
1783
FUNNYUVCODE
1784
FUNNYUVCODE
1785
FUNNYUVCODE
1786
FUNNYUVCODE
1787

    
1788
                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
1789
                  "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2)
1790
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
1791
        );
1792
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
1793
                {
1794
//                        printf("%d %d %d\n", dstWidth, i, srcW);
1795
                        dst[i] = src1[srcW-1]*128;
1796
                        dst[i+2048] = src2[srcW-1]*128;
1797
                }
1798
        }
1799
        else
1800
        {
1801
#endif
1802
        asm volatile(
1803
                "xorl %%eax, %%eax                \n\t" // i
1804
                "xorl %%ebx, %%ebx                \n\t" // xx
1805
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
1806
                ".balign 16                        \n\t"
1807
                "1:                                \n\t"
1808
                "movl %0, %%esi                        \n\t"
1809
                "movzbl  (%%esi, %%ebx), %%edi        \n\t" //src[xx]
1810
                "movzbl 1(%%esi, %%ebx), %%esi        \n\t" //src[xx+1]
1811
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
1812
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
1813
                "shll $16, %%edi                \n\t"
1814
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1815
                "movl %1, %%edi                        \n\t"
1816
                "shrl $9, %%esi                        \n\t"
1817
                "movw %%si, (%%edi, %%eax, 2)        \n\t"
1818

    
1819
                "movzbl  (%5, %%ebx), %%edi        \n\t" //src[xx]
1820
                "movzbl 1(%5, %%ebx), %%esi        \n\t" //src[xx+1]
1821
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
1822
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
1823
                "shll $16, %%edi                \n\t"
1824
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
1825
                "movl %1, %%edi                        \n\t"
1826
                "shrl $9, %%esi                        \n\t"
1827
                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
1828

    
1829
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
1830
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>8 + carry
1831
                "addl $1, %%eax                        \n\t"
1832
                "cmpl %2, %%eax                        \n\t"
1833
                " jb 1b                                \n\t"
1834

    
1835
                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
1836
                "r" (src2)
1837
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
1838
                );
1839
#ifdef HAVE_MMX2
1840
        } //if MMX2 cant be used
1841
#endif
1842
#else
1843
        int i;
1844
        unsigned int xpos=0;
1845
        for(i=0;i<dstWidth;i++)
1846
        {
1847
                register unsigned int xx=xpos>>16;
1848
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
1849
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1850
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1851
/* slower
1852
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
1853
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
1854
*/
1855
                xpos+=xInc;
1856
        }
1857
#endif
1858
   }
1859
}
1860

    
1861
static inline void RENAME(initFilter)(int16_t *dstFilter, int16_t *filterPos, int *filterSize, int xInc,
1862
                                      int srcW, int dstW, int filterAlign, int one)
1863
{
1864
        int i;
1865
        double filter[8000];
1866
#ifdef HAVE_MMX
1867
        asm volatile("emms\n\t"::: "memory"); //FIXME this shouldnt be required but it IS (even for non mmx versions)
1868
#endif
1869

    
1870
        if(ABS(xInc - 0x10000) <10) // unscaled
1871
        {
1872
                int i;
1873
                *filterSize= (1 +(filterAlign-1)) & (~(filterAlign-1)); // 1 or 4 normaly
1874
                for(i=0; i<dstW*(*filterSize); i++) filter[i]=0;
1875

    
1876
                for(i=0; i<dstW; i++)
1877
                {
1878
                        filter[i*(*filterSize)]=1;
1879
                        filterPos[i]=i;
1880
                }
1881

    
1882
        }
1883
        else if(xInc <= (1<<16) || sws_flags==SWS_FAST_BILINEAR) // upscale
1884
        {
1885
                int i;
1886
                int xDstInSrc;
1887
                if(sws_flags==SWS_BICUBIC) *filterSize= 4;
1888
                else                           *filterSize= 2;
1889
//                printf("%d %d %d\n", filterSize, srcW, dstW);
1890
                *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
1891

    
1892
                xDstInSrc= xInc - 0x8000;
1893
                for(i=0; i<dstW; i++)
1894
                {
1895
                        int xx= (xDstInSrc>>16) - (*filterSize>>1) + 1;
1896
                        int j;
1897

    
1898
                        filterPos[i]= xx;
1899
                        if(sws_flags == SWS_BICUBIC)
1900
                        {
1901
                                double d= ABS(((xx+1)<<16) - xDstInSrc)/(double)(1<<16);
1902
                                double y1,y2,y3,y4;
1903
                                double A= -0.75;
1904
                                        // Equation is from VirtualDub
1905
                                y1 = (        +     A*d -       2.0*A*d*d +       A*d*d*d);
1906
                                y2 = (+ 1.0             -     (A+3.0)*d*d + (A+2.0)*d*d*d);
1907
                                y3 = (        -     A*d + (2.0*A+3.0)*d*d - (A+2.0)*d*d*d);
1908
                                y4 = (                  +           A*d*d -       A*d*d*d);
1909

    
1910
//                                printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1911
                                filter[i*(*filterSize) + 0]= y1;
1912
                                filter[i*(*filterSize) + 1]= y2;
1913
                                filter[i*(*filterSize) + 2]= y3;
1914
                                filter[i*(*filterSize) + 3]= y4;
1915
//                                printf("%1.3f %d, %d, %d, %d\n",d , y1, y2, y3, y4);
1916
                        }
1917
                        else
1918
                        {
1919
                                for(j=0; j<*filterSize; j++)
1920
                                {
1921
                                        double d= ABS((xx<<16) - xDstInSrc)/(double)(1<<16);
1922
                                        double coeff= 1.0 - d;
1923
                                        if(coeff<0) coeff=0;
1924
        //                                printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1925
                                        filter[i*(*filterSize) + j]= coeff;
1926
                                        xx++;
1927
                                }
1928
                        }
1929
                        xDstInSrc+= xInc;
1930
                }
1931
        }
1932
        else // downscale
1933
        {
1934
                int xDstInSrc;
1935
                if(sws_flags==SWS_BICUBIC) *filterSize= (int)ceil(1 + 4.0*srcW / (double)dstW);
1936
                else                           *filterSize= (int)ceil(1 + 2.0*srcW / (double)dstW);
1937
//                printf("%d %d %d\n", *filterSize, srcW, dstW);
1938
                *filterSize= (*filterSize +(filterAlign-1)) & (~(filterAlign-1));
1939

    
1940
                xDstInSrc= xInc - 0x8000;
1941
                for(i=0; i<dstW; i++)
1942
                {
1943
                        int xx= (int)((double)xDstInSrc/(double)(1<<16) - *filterSize*0.5 + 0.5);
1944
                        int j;
1945

    
1946
                        filterPos[i]= xx;
1947
                        for(j=0; j<*filterSize; j++)
1948
                        {
1949
                                double d= ABS((xx<<16) - xDstInSrc)/(double)xInc;
1950
                                double coeff;
1951
                                if(sws_flags == SWS_BICUBIC)
1952
                                {
1953
                                        double A= -0.75;
1954
//                                        d*=2;
1955
                                        // Equation is from VirtualDub
1956
                                        if(d<1.0)
1957
                                                coeff = (1.0 - (A+3.0)*d*d + (A+2.0)*d*d*d);
1958
                                        else if(d<2.0)
1959
                                                coeff = (-4.0*A + 8.0*A*d - 5.0*A*d*d + A*d*d*d);
1960
                                        else
1961
                                                coeff=0.0;
1962
                                }
1963
                                else
1964
                                {
1965
                                        coeff= 1.0 - d;
1966
                                        if(coeff<0) coeff=0;
1967
                                }
1968
//                                if(filterAlign==1) printf("%d %d %d \n", coeff, (int)d, xDstInSrc);
1969
                                filter[i*(*filterSize) + j]= coeff;
1970
                                xx++;
1971
                        }
1972
                        xDstInSrc+= xInc;
1973
                }
1974
        }
1975

    
1976
        //fix borders
1977
        for(i=0; i<dstW; i++)
1978
        {
1979
                int j;
1980
                if(filterPos[i] < 0)
1981
                {
1982
                        // Move filter coeffs left to compensate for filterPos
1983
                        for(j=1; j<*filterSize; j++)
1984
                        {
1985
                                int left= MAX(j + filterPos[i], 0);
1986
                                filter[i*(*filterSize) + left] += filter[i*(*filterSize) + j];
1987
                                filter[i*(*filterSize) + j]=0;
1988
                        }
1989
                        filterPos[i]= 0;
1990
                }
1991

    
1992
                if(filterPos[i] + (*filterSize) > srcW)
1993
                {
1994
                        int shift= filterPos[i] + (*filterSize) - srcW;
1995
                        // Move filter coeffs right to compensate for filterPos
1996
                        for(j=(*filterSize)-2; j>=0; j--)
1997
                        {
1998
                                int right= MIN(j + shift, (*filterSize)-1);
1999
                                filter[i*(*filterSize) +right] += filter[i*(*filterSize) +j];
2000
                                filter[i*(*filterSize) +j]=0;
2001
                        }
2002
                        filterPos[i]= srcW - (*filterSize);
2003
                }
2004
        }
2005

    
2006
        //FIXME try to align filterpos if possible / try to shift filterpos to put zeros at the end
2007
        // and skip these than later
2008

    
2009
        //Normalize
2010
        for(i=0; i<dstW; i++)
2011
        {
2012
                int j;
2013
                double sum=0;
2014
                double scale= one;
2015
                for(j=0; j<*filterSize; j++)
2016
                {
2017
                        sum+= filter[i*(*filterSize) + j];
2018
                }
2019
                scale/= sum;
2020
                for(j=0; j<*filterSize; j++)
2021
                {
2022
                        dstFilter[i*(*filterSize) + j]= (int)(filter[i*(*filterSize) + j]*scale);
2023
                }
2024
        }
2025
}
2026

    
2027
#ifdef HAVE_MMX2
/*
 * Generate a specialized MMX2 horizontal scaling routine at runtime.
 *
 * A template asm fragment (between labels 0: and 9: below) is copied into
 * 'funnyCode' once per group of 4 output pixels; the immediates of its two
 * pshufw instructions are patched so each copy picks the correct source
 * pixels for its position. The generated buffer is later entered via
 * "call funnyYCode"/"call funnyUVCode" from the scaler loops.
 *
 * dstW      : number of destination pixels the generated code will produce
 * xInc      : 16.16 fixed-point source step per destination pixel
 * funnyCode : destination buffer for the generated machine code
 *             (must be executable; size assumptions are the caller's)
 *
 * NOTE(review): upscale only — the pshufw trick cannot skip more than 3
 * source pixels per output pixel, so this must not be used for downscaling.
 */
static void initMMX2HScaler(int dstW, int xInc, uint8_t *funnyCode)
{
        uint8_t *fragment;      // start address of the template fragment
        int imm8OfPShufW1;      // offset of 1st pshufw immediate within fragment
        int imm8OfPShufW2;      // offset of 2nd pshufw immediate within fragment
        int fragmentLength;

        int xpos, i;

        // create an optimized horizontal scaling routine

        // code fragment: executed once here only to measure it via the labels;
        // the jmp skips straight to 9: so the body never runs at init time
        asm volatile(
                "jmp 9f                                \n\t"
        // Begin
                "0:                                \n\t"
                "movq (%%esi), %%mm0                \n\t" //FIXME Alignment
                "movq %%mm0, %%mm1                \n\t"
                "psrlq $8, %%mm0                \n\t"
                "punpcklbw %%mm7, %%mm1        \n\t"
                "movq %%mm2, %%mm3                \n\t"
                "punpcklbw %%mm7, %%mm0        \n\t"
                "addw %%bx, %%cx                \n\t" //2*xalpha += (4*lumXInc)&0xFFFF
                "pshufw $0xFF, %%mm1, %%mm1        \n\t"
                "1:                                \n\t"
                "adcl %%edx, %%esi                \n\t" //xx+= (4*lumXInc)>>16 + carry
                "pshufw $0xFF, %%mm0, %%mm0        \n\t"
                "2:                                \n\t"
                "psrlw $9, %%mm3                \n\t"
                "psubw %%mm1, %%mm0                \n\t"
                "pmullw %%mm3, %%mm0                \n\t"
                "paddw %%mm6, %%mm2                \n\t" // 2*alpha += xpos&0xFFFF
                "psllw $7, %%mm1                \n\t"
                "paddw %%mm1, %%mm0                \n\t"

                "movq %%mm0, (%%edi, %%eax)        \n\t"

                "addl $8, %%eax                        \n\t"
        // End
                "9:                                \n\t"
//                "int $3\n\t"
                // compute fragment start, pshufw-immediate offsets and total
                // length from the label addresses (labels 1:/2: sit right
                // AFTER each pshufw, hence the decl to step back to the imm8)
                "leal 0b, %0                        \n\t"
                "leal 1b, %1                        \n\t"
                "leal 2b, %2                        \n\t"
                "decl %1                        \n\t"
                "decl %2                        \n\t"
                "subl %0, %1                        \n\t"
                "subl %0, %2                        \n\t"
                "leal 9b, %3                        \n\t"
                "subl %0, %3                        \n\t"
                :"=r" (fragment), "=r" (imm8OfPShufW1), "=r" (imm8OfPShufW2),
                "=r" (fragmentLength)
        );

        xpos= 0; //lumXInc/2 - 0x8000; // difference between pixel centers

        for(i=0; i<dstW/8; i++)
        {
                int xx=xpos>>16;

                if((i&3) == 0)
                {
                        // source-pixel offsets (relative to xx) of the 4
                        // output pixels this fragment copy produces
                        int a=0;
                        int b=((xpos+xInc)>>16) - xx;
                        int c=((xpos+xInc*2)>>16) - xx;
                        int d=((xpos+xInc*3)>>16) - xx;

                        memcpy(funnyCode + fragmentLength*i/4, fragment, fragmentLength);

                        // patch both pshufw immediates with the 2-bit lane
                        // selectors for offsets a..d
                        funnyCode[fragmentLength*i/4 + imm8OfPShufW1]=
                        funnyCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                a | (b<<2) | (c<<4) | (d<<6);

                        // if we dont need to read 8 bytes than dont :), reduces the chance of
                        // crossing a cache line
                        // (0x6E = movd opcode byte, replacing movq's 0x6F —
                        // presumably; verify against the instruction encoding)
                        if(d<3) funnyCode[fragmentLength*i/4 + 1]= 0x6E;

                        // terminate the (current) last copy with a return
                        funnyCode[fragmentLength*(i+4)/4]= RET;
                }
                xpos+=xInc;
        }
/*
        xpos= 0; //chrXInc/2 - 0x10000; // difference between centers of chrom samples
        for(i=0; i<dstUVw/8; i++)
        {
                int xx=xpos>>16;

                if((i&3) == 0)
                {
                        int a=0;
                        int b=((xpos+chrXInc)>>16) - xx;
                        int c=((xpos+chrXInc*2)>>16) - xx;
                        int d=((xpos+chrXInc*3)>>16) - xx;

                        memcpy(funnyUVCode + fragmentLength*i/4, fragment, fragmentLength);

                        funnyUVCode[fragmentLength*i/4 + imm8OfPShufW1]=
                        funnyUVCode[fragmentLength*i/4 + imm8OfPShufW2]=
                                a | (b<<2) | (c<<4) | (d<<6);

                        // if we dont need to read 8 bytes than dont :), reduces the chance of
                        // crossing a cache line
                        if(d<3) funnyUVCode[fragmentLength*i/4 + 1]= 0x6E;

                        funnyUVCode[fragmentLength*(i+4)/4]= RET;
                }
                xpos+=chrXInc;
        }
*/
//        funnyCode[0]= RET;
}
#endif // HAVE_MMX2
2141

    
2142
static void RENAME(SwScale_YV12slice)(unsigned char* srcptr[],int stride[], int srcSliceY ,
2143
                             int srcSliceH, uint8_t* dstptr[], int dststride, int dstbpp,
2144
                             int srcW, int srcH, int dstW, int dstH){
2145

    
2146

    
2147
unsigned int lumXInc= (srcW << 16) / dstW;
2148
unsigned int lumYInc= (srcH << 16) / dstH;
2149
unsigned int chrXInc;
2150
unsigned int chrYInc;
2151

    
2152
static int dstY;
2153

    
2154
// used to detect a size change
2155
static int oldDstW= -1;
2156
static int oldSrcW= -1;
2157
static int oldDstH= -1;
2158
static int oldSrcH= -1;
2159
static int oldFlags=-1;
2160

    
2161
static int lastInLumBuf;
2162
static int lastInChrBuf;
2163

    
2164
int chrDstW, chrDstH;
2165

    
2166
static int lumBufIndex=0;
2167
static int chrBufIndex=0;
2168

    
2169
static int firstTime=1;
2170

    
2171
const int widthAlign= dstbpp==12 ? 16 : 8;
2172
const int bytespp= (dstbpp+1)/8; //(12->1, 15&16->2, 24->3, 32->4)
2173
const int over= dstbpp==12 ?           (((dstW+15)&(~15))) - dststride
2174
                                : (((dstW+7)&(~7)))*bytespp - dststride;
2175
if(dststride%widthAlign !=0 )
2176
{
2177
        if(firstTime)
2178
                fprintf(stderr, "SwScaler: Warning: dstStride is not a multiple of %d!\n"
2179
                                "SwScaler:          ->cannot do aligned memory acesses anymore\n",
2180
                                widthAlign);
2181
}
2182

    
2183
if(over>0)
2184
{
2185
        if(firstTime)
2186
                fprintf(stderr, "SwScaler: Warning: output width is not a multiple of 8 (16 for YV12)\n"
2187
                                "SwScaler:          and dststride is not large enough to handle %d extra bytes\n"
2188
                                "SwScaler:          ->using unoptimized C version for last line(s)\n",
2189
                                over);
2190
}
2191

    
2192

    
2193

    
2194
//printf("%d %d %d %d\n", srcW, srcH, dstW, dstH);
2195
//printf("%d %d %d %d\n", lumXInc, lumYInc, srcSliceY, srcSliceH);
2196

    
2197
#ifdef HAVE_MMX2
2198
canMMX2BeUsed= (lumXInc <= 0x10000 && (dstW&31)==0 && (srcW&15)==0) ? 1 : 0;
2199
if(!canMMX2BeUsed && lumXInc <= 0x10000 && (srcW&15)==0 && sws_flags==SWS_FAST_BILINEAR)
2200
{
2201
        if(firstTime)
2202
                fprintf(stderr, "SwScaler: output Width is not a multiple of 32 -> no MMX2 scaler\n");
2203
}
2204
#else
2205
canMMX2BeUsed=0; // should be 0 anyway but ...
2206
#endif
2207

    
2208
if(firstTime)
2209
{
2210
#if defined (DITHER1XBPP) && defined (HAVE_MMX)
2211
        char *dither= " dithered";
2212
#else
2213
        char *dither= "";
2214
#endif
2215
        if(sws_flags==SWS_FAST_BILINEAR)
2216
                fprintf(stderr, "\nSwScaler: FAST_BILINEAR scaler ");
2217
        else if(sws_flags==SWS_BILINEAR)
2218
                fprintf(stderr, "\nSwScaler: BILINEAR scaler ");
2219
        else if(sws_flags==SWS_BICUBIC)
2220
                fprintf(stderr, "\nSwScaler: BICUBIC scaler ");
2221
        else
2222
                fprintf(stderr, "\nSwScaler: ehh flags invalid?! ");
2223

    
2224
        if(dstbpp==15)
2225
                fprintf(stderr, "with%s BGR15 output ", dither);
2226
        else if(dstbpp==16)
2227
                fprintf(stderr, "with%s BGR16 output ", dither);
2228
        else if(dstbpp==24)
2229
                fprintf(stderr, "with BGR24 output ");
2230
        else if(dstbpp==32)
2231
                fprintf(stderr, "with BGR32 output ");
2232
        else if(dstbpp==12)
2233
                fprintf(stderr, "with YV12 output ");
2234
        else
2235
                fprintf(stderr, "without output ");
2236

    
2237
#ifdef HAVE_MMX2
2238
                fprintf(stderr, "using MMX2\n");
2239
#elif defined (HAVE_3DNOW)
2240
                fprintf(stderr, "using 3DNOW\n");
2241
#elif defined (HAVE_MMX)
2242
                fprintf(stderr, "using MMX\n");
2243
#elif defined (ARCH_X86)
2244
                fprintf(stderr, "using X86 ASM\n");
2245
#else
2246
                fprintf(stderr, "using C\n");
2247
#endif
2248
}
2249

    
2250

    
2251
// match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src to pixel n-2 of dst
2252
// n-2 is the last chrominance sample available
2253
// this is not perfect, but noone shuld notice the difference, the more correct variant
2254
// would be like the vertical one, but that would require some special code for the
2255
// first and last pixel
2256
if(sws_flags==SWS_FAST_BILINEAR)
2257
{
2258
        if(canMMX2BeUsed)         lumXInc+= 20;
2259
#ifndef HAVE_MMX //we dont use the x86asm scaler if mmx is available
2260
        else                        lumXInc = ((srcW-2)<<16)/(dstW-2) - 20;
2261
#endif
2262
}
2263

    
2264
if(fullUVIpol && !(dstbpp==12))         chrXInc= lumXInc>>1, chrDstW= dstW;
2265
else                                        chrXInc= lumXInc,    chrDstW= (dstW+1)>>1;
2266

    
2267
if(dstbpp==12)        chrYInc= lumYInc,    chrDstH= (dstH+1)>>1;
2268
else                chrYInc= lumYInc>>1, chrDstH= dstH;
2269

    
2270
  // force calculation of the horizontal interpolation of the first line
2271

    
2272
  if(srcSliceY ==0){
2273
//        printf("dstW %d, srcw %d, mmx2 %d\n", dstW, srcW, canMMX2BeUsed);
2274
        lumBufIndex=0;
2275
        chrBufIndex=0;
2276
        dstY=0;
2277

    
2278
        //precalculate horizontal scaler filter coefficients
2279
        if(oldDstW!=dstW || oldSrcW!=srcW || oldFlags!=sws_flags)
2280
        {
2281
#ifdef HAVE_MMX
2282
                const int filterAlign=4;
2283
#else
2284
                const int filterAlign=1;
2285
#endif
2286
                oldDstW= dstW; oldSrcW= srcW; oldFlags= sws_flags;
2287

    
2288
                RENAME(initFilter)(hLumFilter, hLumFilterPos, &hLumFilterSize, lumXInc,
2289
                                srcW   , dstW   , filterAlign, 1<<14);
2290
                RENAME(initFilter)(hChrFilter, hChrFilterPos, &hChrFilterSize, chrXInc,
2291
                                (srcW+1)>>1, chrDstW, filterAlign, 1<<14);
2292

    
2293
#ifdef HAVE_MMX2
2294
// cant downscale !!!
2295
                if(canMMX2BeUsed && sws_flags == SWS_FAST_BILINEAR)
2296
                {
2297
                        initMMX2HScaler(dstW   , lumXInc, funnyYCode);
2298
                        initMMX2HScaler(chrDstW, chrXInc, funnyUVCode);
2299
                }
2300
#endif
2301
        } // Init Horizontal stuff
2302

    
2303
        if(oldDstH!=dstH || oldSrcH!=srcH || oldFlags!=sws_flags)
2304
        {
2305
                int i;
2306
                oldDstH= dstH; oldSrcH= srcH; oldFlags= sws_flags; //FIXME swsflags conflict with x check
2307

    
2308
                // deallocate pixbufs
2309
                for(i=0; i<vLumBufSize; i++) free(lumPixBuf[i]);
2310
                for(i=0; i<vChrBufSize; i++) free(chrPixBuf[i]);
2311

    
2312
                RENAME(initFilter)(vLumFilter, vLumFilterPos, &vLumFilterSize, lumYInc,
2313
                                srcH   , dstH,    1, (1<<12)-4);
2314
                RENAME(initFilter)(vChrFilter, vChrFilterPos, &vChrFilterSize, chrYInc,
2315
                                (srcH+1)>>1, chrDstH, 1, (1<<12)-4);
2316

    
2317
                // Calculate Buffer Sizes so that they wont run out while handling these damn slices
2318
                vLumBufSize= vLumFilterSize; vChrBufSize= vChrFilterSize;
2319
                for(i=0; i<dstH; i++)
2320
                {
2321
                        int chrI= i*chrDstH / dstH;
2322
                        int nextSlice= MAX(vLumFilterPos[i   ] + vLumFilterSize - 1,
2323
                                         ((vChrFilterPos[chrI] + vChrFilterSize - 1)<<1));
2324
                        nextSlice&= ~1; // Slices start at even boundaries
2325
                        if(vLumFilterPos[i   ] + vLumBufSize < nextSlice)
2326
                                vLumBufSize= nextSlice - vLumFilterPos[i   ];
2327
                        if(vChrFilterPos[chrI] + vChrBufSize < (nextSlice>>1))
2328
                                vChrBufSize= (nextSlice>>1) - vChrFilterPos[chrI];
2329
                }
2330

    
2331
                // allocate pixbufs (we use dynamic allocation because otherwise we would need to
2332
                // allocate several megabytes to handle all possible cases)
2333
                for(i=0; i<vLumBufSize; i++)
2334
                        lumPixBuf[i]= lumPixBuf[i+vLumBufSize]= (uint16_t*)memalign(8, 4000);
2335
                for(i=0; i<vChrBufSize; i++)
2336
                        chrPixBuf[i]= chrPixBuf[i+vChrBufSize]= (uint16_t*)memalign(8, 8000);
2337

    
2338
                //try to avoid drawing green stuff between the right end and the stride end
2339
                for(i=0; i<vLumBufSize; i++) memset(lumPixBuf[i], 0, 4000);
2340
                for(i=0; i<vChrBufSize; i++) memset(chrPixBuf[i], 64, 8000);
2341

    
2342
                ASSERT(chrDstH<=dstH)
2343
                ASSERT(vLumFilterSize*dstH*4<16000)
2344
                ASSERT(vChrFilterSize*chrDstH*4<16000)
2345
#ifdef HAVE_MMX
2346
                // pack filter data for mmx code
2347
                for(i=0; i<vLumFilterSize*dstH; i++)
2348
                        lumMmxFilter[4*i]=lumMmxFilter[4*i+1]=lumMmxFilter[4*i+2]=lumMmxFilter[4*i+3]=
2349
                                vLumFilter[i];
2350
                for(i=0; i<vChrFilterSize*chrDstH; i++)
2351
                        chrMmxFilter[4*i]=chrMmxFilter[4*i+1]=chrMmxFilter[4*i+2]=chrMmxFilter[4*i+3]=
2352
                                vChrFilter[i];
2353
#endif
2354
        }
2355

    
2356
        if(firstTime && verbose)
2357
        {
2358
#ifdef HAVE_MMX2
2359
                int mmx2=1;
2360
#else
2361
                int mmx2=0;
2362
#endif
2363
#ifdef HAVE_MMX
2364
                int mmx=1;
2365
#else
2366
                int mmx=0;
2367
#endif
2368

    
2369
#ifdef HAVE_MMX
2370
                if(canMMX2BeUsed && sws_flags==SWS_FAST_BILINEAR)
2371
                        printf("SwScaler: using FAST_BILINEAR MMX2 scaler for horizontal scaling\n");
2372
                else
2373
                {
2374
                        if(hLumFilterSize==4)
2375
                                printf("SwScaler: using 4-tap MMX scaler for horizontal luminance scaling\n");
2376
                        else if(hLumFilterSize==8)
2377
                                printf("SwScaler: using 8-tap MMX scaler for horizontal luminance scaling\n");
2378
                        else
2379
                                printf("SwScaler: using n-tap MMX scaler for horizontal luminance scaling\n");
2380

    
2381
                        if(hChrFilterSize==4)
2382
                                printf("SwScaler: using 4-tap MMX scaler for horizontal chrominance scaling\n");
2383
                        else if(hChrFilterSize==8)
2384
                                printf("SwScaler: using 8-tap MMX scaler for horizontal chrominance scaling\n");
2385
                        else
2386
                                printf("SwScaler: using n-tap MMX scaler for horizontal chrominance scaling\n");
2387
                }
2388
#elif defined (ARCH_X86)
2389
                printf("SwScaler: using X86-Asm scaler for horizontal scaling\n");
2390
#else
2391
                if(sws_flags==SWS_FAST_BILINEAR)
2392
                        printf("SwScaler: using FAST_BILINEAR C scaler for horizontal scaling\n");
2393
                else
2394
                        printf("SwScaler: using C scaler for horizontal scaling\n");
2395
#endif
2396

    
2397
                if(dstbpp==12)
2398
                {
2399
                        if(vLumFilterSize==1)
2400
                                printf("SwScaler: using 1-tap %s \"scaler\" for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
2401
                        else
2402
                                printf("SwScaler: using n-tap %s scaler for vertical scaling (YV12)\n", mmx ? "MMX" : "C");
2403
                }
2404
                else
2405
                {
2406
                        if(vLumFilterSize==1 && vChrFilterSize==2)
2407
                                printf("SwScaler: using 1-tap %s \"scaler\" for vertical luminance scaling (BGR)\n"
2408
                                       "SwScaler:       2-tap scaler for vertical chrominance scaling (BGR)\n", mmx ? "MMX" : "C");
2409
                        else if(vLumFilterSize==2 && vChrFilterSize==2)
2410
                                printf("SwScaler: using 2-tap linear %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
2411
                        else
2412
                                printf("SwScaler: using n-tap %s scaler for vertical scaling (BGR)\n", mmx ? "MMX" : "C");
2413
                }
2414

    
2415
                if(dstbpp==24)
2416
                        printf("SwScaler: using %s YV12->BGR24 Converter\n",
2417
                                mmx2 ? "MMX2" : (mmx ? "MMX" : "C"));
2418
                else
2419
                        printf("SwScaler: using %s YV12->BGR%d Converter\n", mmx ? "MMX" : "C", dstbpp);
2420

    
2421
                printf("SwScaler: %dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
2422
        }
2423

    
2424
        lastInLumBuf= -1;
2425
        lastInChrBuf= -1;
2426
  } // if(firstLine)
2427

    
2428
        for(;dstY < dstH; dstY++){
                unsigned char *dest =dstptr[0]+dststride*dstY;
                unsigned char *uDest=dstptr[1]+(dststride>>1)*(dstY>>1);
                unsigned char *vDest=dstptr[2]+(dststride>>1)*(dstY>>1);
                const int chrDstY= dstbpp==12 ? (dstY>>1) : dstY;

                const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
                const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
                const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
                const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

                if(sws_flags == SWS_FAST_BILINEAR)
                {
                        //handle holes
                        if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
                        if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
                }

                ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

                // Do we have enough lines in this slice to output the dstY line
                if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
                {
                        //Do horizontal scaling
                        while(lastInLumBuf < lastLumSrcY)
                        {
                                uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
                                lumBufIndex++;
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//                                printf("%d %d\n", lumBufIndex, vLumBufSize);
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf < lastChrSrcY)
                        {
                                uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
                                uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
                                RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                }
                else // not enough lines left in this slice -> load the rest in the buffer
                {
/*                printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
                        firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
                        lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
                        vChrBufSize, vLumBufSize);
*/
                        //Do horizontal scaling
                        while(lastInLumBuf+1 < srcSliceY + srcSliceH)
                        {
                                uint8_t *src= srcptr[0]+(lastInLumBuf + 1 - srcSliceY)*stride[0];
                                lumBufIndex++;
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, src, srcW, lumXInc);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
                        {
                                uint8_t *src1= srcptr[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[1];
                                uint8_t *src2= srcptr[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*stride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
                                RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                        break; //we cant output a dstY line so lets try with the next slice
                }

#ifdef HAVE_MMX
                b5Dither= dither8[dstY&1];
                g6Dither= dither4[dstY&1];
                g5Dither= dither8[dstY&1];
                r5Dither= dither8[(dstY+1)&1];
#endif
            if(dstY < dstH-2 || over<=0)
            {
                if(dstbpp==12) //YV12
                {
                        if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                        {
                                int16_t *lumBuf = lumPixBuf[0];
                                int16_t *chrBuf= chrPixBuf[0];
                                RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
                        }
                        else //General YV12
                        {
                                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                                RENAME(yuv2yuvX)(
                                        vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
                                        vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, uDest, vDest, dstW,
                                        lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
                        }
                }
                else
                {
                        int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                        int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;

                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                        {
                                int chrAlpha= vChrFilter[2*dstY+1];

                                RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, chrAlpha, dstbpp);
                        }
                        else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                        {
                                int lumAlpha= vLumFilter[2*dstY+1];
                                int chrAlpha= vChrFilter[2*dstY+1];

                                RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, lumAlpha, chrAlpha, dstbpp);
                        }
                        else //General RGB
                        {
                                RENAME(yuv2rgbX)(
                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, dstW, dstbpp,
                                        lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
                        }
                }
            }
            else // hmm looks like we cant use MMX here without overwriting this arrays tail
            {
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                if(dstbpp==12) //YV12
                {
                        if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
                                vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, uDest, vDest, dstW);
                }
                else
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        yuv2rgbXinC(
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, dstW, dstbpp);
                }
            }
        }

#ifdef HAVE_MMX
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        firstTime=0;
}