/* ffmpeg / postproc / swscale_template.c @ ac6a2e45 */
/*
    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
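/* These macros paste straight into the inline-asm strings below; e.g.
   MOVNTQ(%%mm0, (%1, %%eax)) stringizes its arguments and expands to
   "movntq %%mm0, (%1, %%eax) \n\t" on MMX2 builds, and to the plain
   movq form otherwise, so the same template text works on both. */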


#define YSCALEYUV2YV12X(x) \
        "xorl %%eax, %%eax \n\t"\
        "pxor %%mm3, %%mm3 \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "movl %0, %%edx \n\t"\
        ".balign 16 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movl (%1, %%edx, 4), %%esi \n\t"\
        "movq (%2, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
        "movq " #x "(%%esi, %%eax, 2), %%mm2 \n\t" /* srcData */\
        "movq 8+" #x "(%%esi, %%eax, 2), %%mm5 \n\t" /* srcData */\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "addl $1, %%edx \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"\
        MOVNTQ(%%mm3, (%3, %%eax))\
        "addl $8, %%eax \n\t"\
        "cmpl %4, %%eax \n\t"\
        "pxor %%mm3, %%mm3 \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "movl %0, %%edx \n\t"\
        "jb 1b \n\t"
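/* YSCALEYUV2YV12X(x): vertical scaling of one plane. %%edx runs from
   -filterSize up to 0; each step fetches the next source line pointer,
   multiplies 2x4 samples by the per-line coefficient (pmulhw) and
   accumulates in mm3/mm4. The sums are shifted right by 3, packed to
   unsigned bytes and stored with MOVNTQ; the byte offset x selects the
   U (0) or V (4096) half of the chroma buffers. */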

    
#define YSCALEYUV2YV121 \
        "movl %2, %%eax \n\t"\
        ".balign 16 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movq (%0, %%eax, 2), %%mm0 \n\t"\
        "movq 8(%0, %%eax, 2), %%mm1 \n\t"\
        "psraw $7, %%mm0 \n\t"\
        "psraw $7, %%mm1 \n\t"\
        "packuswb %%mm1, %%mm0 \n\t"\
        MOVNTQ(%%mm0, (%1, %%eax))\
        "addl $8, %%eax \n\t"\
        "jnc 1b \n\t"
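/* YSCALEYUV2YV121: 1:1 vertical pass. The 16-bit intermediate samples
   are simply shifted right by 7 and packed to bytes; %%eax counts up
   from a negative offset, so jnc exits once it wraps past zero. */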

    
/*
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2RGBX \
        "xorl %%eax, %%eax \n\t"\
        ".balign 16 \n\t"\
        "1: \n\t"\
        "movl %1, %%edx \n\t" /* -chrFilterSize */\
        "movl %3, %%ebx \n\t" /* chrMmxFilter+chrFilterSize */\
        "movl %7, %%ecx \n\t" /* chrSrc+chrFilterSize */\
        "pxor %%mm3, %%mm3 \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "2: \n\t"\
        "movl (%%ecx, %%edx, 4), %%esi \n\t"\
        "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%esi, %%eax), %%mm2 \n\t" /* UsrcData */\
        "movq 4096(%%esi, %%eax), %%mm5 \n\t" /* VsrcData */\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "addl $1, %%edx \n\t"\
        " jnz 2b \n\t"\
\
        "movl %0, %%edx \n\t" /* -lumFilterSize */\
        "movl %2, %%ebx \n\t" /* lumMmxFilter+lumFilterSize */\
        "movl %6, %%ecx \n\t" /* lumSrc+lumFilterSize */\
        "pxor %%mm1, %%mm1 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        "2: \n\t"\
        "movl (%%ecx, %%edx, 4), %%esi \n\t"\
        "movq (%%ebx, %%edx, 8), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%esi, %%eax, 2), %%mm2 \n\t" /* Y1srcData */\
        "movq 8(%%esi, %%eax, 2), %%mm5 \n\t" /* Y2srcData */\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm1 \n\t"\
        "paddw %%mm5, %%mm7 \n\t"\
        "addl $1, %%edx \n\t"\
        " jnz 2b \n\t"\
\
        "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
        "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
        "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
        "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
        "pmulhw "MANGLE(ugCoeff)", %%mm3 \n\t"\
        "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
        "pmulhw "MANGLE(ubCoeff)", %%mm2 \n\t"\
        "pmulhw "MANGLE(vrCoeff)", %%mm5 \n\t"\
        "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
        "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
        "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
        "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
        "paddw %%mm3, %%mm4 \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "movq %%mm5, %%mm6 \n\t"\
        "movq %%mm4, %%mm3 \n\t"\
        "punpcklwd %%mm2, %%mm2 \n\t"\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "punpcklwd %%mm4, %%mm4 \n\t"\
        "paddw %%mm1, %%mm2 \n\t"\
        "paddw %%mm1, %%mm5 \n\t"\
        "paddw %%mm1, %%mm4 \n\t"\
        "punpckhwd %%mm0, %%mm0 \n\t"\
        "punpckhwd %%mm6, %%mm6 \n\t"\
        "punpckhwd %%mm3, %%mm3 \n\t"\
        "paddw %%mm7, %%mm0 \n\t"\
        "paddw %%mm7, %%mm6 \n\t"\
        "paddw %%mm7, %%mm3 \n\t"\
        /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
        "packuswb %%mm0, %%mm2 \n\t"\
        "packuswb %%mm6, %%mm5 \n\t"\
        "packuswb %%mm3, %%mm4 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"
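/* YSCALEYUV2RGBX: multi-tap vertical scale straight to RGB. The first
   inner loop accumulates the chroma taps (U in mm3, V in mm4), the
   second the luma taps (Y1 in mm1, Y2 in mm7); the w400/w80 constants
   and the ug/vg/ub/vr/y coefficient tables then apply the YUV->RGB
   matrix, leaving packed B/G/R bytes for 8 pixels in mm2/mm4/mm5 ready
   for one of the WRITEBGR* stores below. */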

    
#define FULL_YSCALEYUV2RGB \
        "pxor %%mm7, %%mm7 \n\t"\
        "movd %6, %%mm6 \n\t" /*yalpha1*/\
        "punpcklwd %%mm6, %%mm6 \n\t"\
        "punpcklwd %%mm6, %%mm6 \n\t"\
        "movd %7, %%mm5 \n\t" /*uvalpha1*/\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "xorl %%eax, %%eax \n\t"\
        ".balign 16 \n\t"\
        "1: \n\t"\
        "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
        "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
        "movq (%2, %%eax,2), %%mm2 \n\t" /* uvbuf0[eax]*/\
        "movq (%3, %%eax,2), %%mm3 \n\t" /* uvbuf1[eax]*/\
        "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
        "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
        "pmulhw %%mm6, %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
        "pmulhw %%mm5, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
        "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
        "movq 4096(%2, %%eax,2), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
        "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
        "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
        "movq 4096(%3, %%eax,2), %%mm0 \n\t" /* uvbuf1[eax+2048]*/\
        "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
        "psubw %%mm0, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
        "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
        "psubw "MANGLE(w400)", %%mm3 \n\t" /* 8(U-128)*/\
        "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
\
\
        "pmulhw %%mm5, %%mm4 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
        "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
        "pmulhw "MANGLE(ubCoeff)", %%mm3 \n\t"\
        "psraw $4, %%mm0 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
        "pmulhw "MANGLE(ugCoeff)", %%mm2 \n\t"\
        "paddw %%mm4, %%mm0 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
        "psubw "MANGLE(w400)", %%mm0 \n\t" /* (V-128)8*/\
\
\
        "movq %%mm0, %%mm4 \n\t" /* (V-128)8*/\
        "pmulhw "MANGLE(vrCoeff)", %%mm0 \n\t"\
        "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
        "paddw %%mm1, %%mm3 \n\t" /* B*/\
        "paddw %%mm1, %%mm0 \n\t" /* R*/\
        "packuswb %%mm3, %%mm3 \n\t"\
\
        "packuswb %%mm0, %%mm0 \n\t"\
        "paddw %%mm4, %%mm2 \n\t"\
        "paddw %%mm2, %%mm1 \n\t" /* G*/\
\
        "packuswb %%mm1, %%mm1 \n\t"
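/* FULL_YSCALEYUV2RGB: two-line (bilinear) blend with horizontally
   full-resolution chroma, 4 pixels per iteration; it ends with B, G
   and R packed in mm3, mm1 and mm0, which the callers below interleave
   into the destination format. */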

    
#define YSCALEYUV2RGB \
        "movd %6, %%mm6 \n\t" /*yalpha1*/\
        "punpcklwd %%mm6, %%mm6 \n\t"\
        "punpcklwd %%mm6, %%mm6 \n\t"\
        "movq %%mm6, "MANGLE(asm_yalpha1)" \n\t"\
        "movd %7, %%mm5 \n\t" /*uvalpha1*/\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "movq %%mm5, "MANGLE(asm_uvalpha1)" \n\t"\
        "xorl %%eax, %%eax \n\t"\
        ".balign 16 \n\t"\
        "1: \n\t"\
        "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
        "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
        "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
        "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
        "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
        "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
        "movq "MANGLE(asm_uvalpha1)", %%mm0 \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
        "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
        "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
        "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
        "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
        "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
        "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
        "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
        "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
        "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
        "pmulhw "MANGLE(ugCoeff)", %%mm3 \n\t"\
        "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
        "movq (%0, %%eax, 2), %%mm0 \n\t" /*buf0[eax]*/\
        "movq (%1, %%eax, 2), %%mm1 \n\t" /*buf1[eax]*/\
        "movq 8(%0, %%eax, 2), %%mm6 \n\t" /*buf0[eax]*/\
        "movq 8(%1, %%eax, 2), %%mm7 \n\t" /*buf1[eax]*/\
        "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
        "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
        "pmulhw "MANGLE(asm_yalpha1)", %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
        "pmulhw "MANGLE(asm_yalpha1)", %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
        "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
        "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
        "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
        "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
        "pmulhw "MANGLE(ubCoeff)", %%mm2 \n\t"\
        "pmulhw "MANGLE(vrCoeff)", %%mm5 \n\t"\
        "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
        "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
        "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
        "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
        "paddw %%mm3, %%mm4 \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "movq %%mm5, %%mm6 \n\t"\
        "movq %%mm4, %%mm3 \n\t"\
        "punpcklwd %%mm2, %%mm2 \n\t"\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "punpcklwd %%mm4, %%mm4 \n\t"\
        "paddw %%mm1, %%mm2 \n\t"\
        "paddw %%mm1, %%mm5 \n\t"\
        "paddw %%mm1, %%mm4 \n\t"\
        "punpckhwd %%mm0, %%mm0 \n\t"\
        "punpckhwd %%mm6, %%mm6 \n\t"\
        "punpckhwd %%mm3, %%mm3 \n\t"\
        "paddw %%mm7, %%mm0 \n\t"\
        "paddw %%mm7, %%mm6 \n\t"\
        "paddw %%mm7, %%mm3 \n\t"\
        /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
        "packuswb %%mm0, %%mm2 \n\t"\
        "packuswb %%mm6, %%mm5 \n\t"\
        "packuswb %%mm3, %%mm4 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"

#define YSCALEYUV2RGB1 \
        "xorl %%eax, %%eax \n\t"\
        ".balign 16 \n\t"\
        "1: \n\t"\
        "movq (%2, %%eax), %%mm3 \n\t" /* uvbuf0[eax]*/\
        "movq 4096(%2, %%eax), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
        "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
        "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
        "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
        "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
        "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
        "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
        "pmulhw "MANGLE(ugCoeff)", %%mm3 \n\t"\
        "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
        "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
        "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
        "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
        "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
        "pmulhw "MANGLE(ubCoeff)", %%mm2 \n\t"\
        "pmulhw "MANGLE(vrCoeff)", %%mm5 \n\t"\
        "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
        "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
        "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
        "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
        "paddw %%mm3, %%mm4 \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "movq %%mm5, %%mm6 \n\t"\
        "movq %%mm4, %%mm3 \n\t"\
        "punpcklwd %%mm2, %%mm2 \n\t"\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "punpcklwd %%mm4, %%mm4 \n\t"\
        "paddw %%mm1, %%mm2 \n\t"\
        "paddw %%mm1, %%mm5 \n\t"\
        "paddw %%mm1, %%mm4 \n\t"\
        "punpckhwd %%mm0, %%mm0 \n\t"\
        "punpckhwd %%mm6, %%mm6 \n\t"\
        "punpckhwd %%mm3, %%mm3 \n\t"\
        "paddw %%mm7, %%mm0 \n\t"\
        "paddw %%mm7, %%mm6 \n\t"\
        "paddw %%mm7, %%mm3 \n\t"\
        /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
        "packuswb %%mm0, %%mm2 \n\t"\
        "packuswb %%mm6, %%mm5 \n\t"\
        "packuswb %%mm3, %%mm4 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"

// do vertical chrominance interpolation
#define YSCALEYUV2RGB1b \
        "xorl %%eax, %%eax \n\t"\
        ".balign 16 \n\t"\
        "1: \n\t"\
        "movq (%2, %%eax), %%mm2 \n\t" /* uvbuf0[eax]*/\
        "movq (%3, %%eax), %%mm3 \n\t" /* uvbuf1[eax]*/\
        "movq 4096(%2, %%eax), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
        "movq 4096(%3, %%eax), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
        "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
        "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
        "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
        "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
        "psubw "MANGLE(w400)", %%mm3 \n\t" /* (U-128)8*/\
        "psubw "MANGLE(w400)", %%mm4 \n\t" /* (V-128)8*/\
        "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
        "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
        "pmulhw "MANGLE(ugCoeff)", %%mm3 \n\t"\
        "pmulhw "MANGLE(vgCoeff)", %%mm4 \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
        "movq (%0, %%eax, 2), %%mm1 \n\t" /*buf0[eax]*/\
        "movq 8(%0, %%eax, 2), %%mm7 \n\t" /*buf0[eax]*/\
        "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
        "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
        "pmulhw "MANGLE(ubCoeff)", %%mm2 \n\t"\
        "pmulhw "MANGLE(vrCoeff)", %%mm5 \n\t"\
        "psubw "MANGLE(w80)", %%mm1 \n\t" /* 8(Y-16)*/\
        "psubw "MANGLE(w80)", %%mm7 \n\t" /* 8(Y-16)*/\
        "pmulhw "MANGLE(yCoeff)", %%mm1 \n\t"\
        "pmulhw "MANGLE(yCoeff)", %%mm7 \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
        "paddw %%mm3, %%mm4 \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "movq %%mm5, %%mm6 \n\t"\
        "movq %%mm4, %%mm3 \n\t"\
        "punpcklwd %%mm2, %%mm2 \n\t"\
        "punpcklwd %%mm5, %%mm5 \n\t"\
        "punpcklwd %%mm4, %%mm4 \n\t"\
        "paddw %%mm1, %%mm2 \n\t"\
        "paddw %%mm1, %%mm5 \n\t"\
        "paddw %%mm1, %%mm4 \n\t"\
        "punpckhwd %%mm0, %%mm0 \n\t"\
        "punpckhwd %%mm6, %%mm6 \n\t"\
        "punpckhwd %%mm3, %%mm3 \n\t"\
        "paddw %%mm7, %%mm0 \n\t"\
        "paddw %%mm7, %%mm6 \n\t"\
        "paddw %%mm7, %%mm3 \n\t"\
        /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
        "packuswb %%mm0, %%mm2 \n\t"\
        "packuswb %%mm6, %%mm5 \n\t"\
        "packuswb %%mm3, %%mm4 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"

#define WRITEBGR32 \
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
        "movq %%mm2, %%mm1 \n\t" /* B */\
        "movq %%mm5, %%mm6 \n\t" /* R */\
        "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
        "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
        "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
        "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
        "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
        "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
        "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
        "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
        "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
        "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
        MOVNTQ(%%mm0, (%4, %%eax, 4))\
        MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
        MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
        MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
        "addl $8, %%eax \n\t"\
        "cmpl %5, %%eax \n\t"\
        " jb 1b \n\t"
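/* WRITEBGR32: interleaves the packed B, G, R bytes (plus a zero byte
   from mm7) via punpckl/hbw and punpckl/hwd into eight 0RGB dwords
   (B,G,R,0 in memory order) and stores them with MOVNTQ. */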

    
#define WRITEBGR16 \
        "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
        "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
        "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
        "psrlq $3, %%mm2 \n\t"\
\
        "movq %%mm2, %%mm1 \n\t"\
        "movq %%mm4, %%mm3 \n\t"\
\
        "punpcklbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm5, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm5, %%mm1 \n\t"\
\
        "psllq $3, %%mm3 \n\t"\
        "psllq $3, %%mm4 \n\t"\
\
        "por %%mm3, %%mm2 \n\t"\
        "por %%mm4, %%mm1 \n\t"\
\
        MOVNTQ(%%mm2, (%4, %%eax, 2))\
        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
        "addl $8, %%eax \n\t"\
        "cmpl %5, %%eax \n\t"\
        " jb 1b \n\t"
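/* WRITEBGR16: masks B and R down to 5 bits (bF8) and G to 6 bits (bFC),
   shifts them into place and ORs them into 5-6-5 packed 16-bit pixels. */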

    
#define WRITEBGR15 \
        "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
        "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
        "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
        "psrlq $3, %%mm2 \n\t"\
        "psrlq $1, %%mm5 \n\t"\
\
        "movq %%mm2, %%mm1 \n\t"\
        "movq %%mm4, %%mm3 \n\t"\
\
        "punpcklbw %%mm7, %%mm3 \n\t"\
        "punpcklbw %%mm5, %%mm2 \n\t"\
        "punpckhbw %%mm7, %%mm4 \n\t"\
        "punpckhbw %%mm5, %%mm1 \n\t"\
\
        "psllq $2, %%mm3 \n\t"\
        "psllq $2, %%mm4 \n\t"\
\
        "por %%mm3, %%mm2 \n\t"\
        "por %%mm4, %%mm1 \n\t"\
\
        MOVNTQ(%%mm2, (%4, %%eax, 2))\
        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
        "addl $8, %%eax \n\t"\
        "cmpl %5, %%eax \n\t"\
        " jb 1b \n\t"
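/* WRITEBGR15: same scheme with 5 bits per component (1-5-5-5). */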

    
#define WRITEBGR24OLD \
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
        "movq %%mm2, %%mm1 \n\t" /* B */\
        "movq %%mm5, %%mm6 \n\t" /* R */\
        "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
        "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
        "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
        "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
        "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
        "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
        "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
        "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
        "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
        "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
        "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
        "psrlq $8, %%mm0 \n\t" /* 00RGB0RG 0 */\
        "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 0 */\
        "pand "MANGLE(bm11111000)", %%mm0 \n\t" /* 00RGB000 0.5 */\
        "por %%mm4, %%mm0 \n\t" /* 00RGBRGB 0 */\
        "movq %%mm2, %%mm4 \n\t" /* 0RGB0RGB 1 */\
        "psllq $48, %%mm2 \n\t" /* GB000000 1 */\
        "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
\
        "movq %%mm4, %%mm2 \n\t" /* 0RGB0RGB 1 */\
        "psrld $16, %%mm4 \n\t" /* 000R000R 1 */\
        "psrlq $24, %%mm2 \n\t" /* 0000RGB0 1.5 */\
        "por %%mm4, %%mm2 \n\t" /* 000RRGBR 1 */\
        "pand "MANGLE(bm00001111)", %%mm2 \n\t" /* 0000RGBR 1 */\
        "movq %%mm1, %%mm4 \n\t" /* 0RGB0RGB 2 */\
        "psrlq $8, %%mm1 \n\t" /* 00RGB0RG 2 */\
        "pand "MANGLE(bm00000111)", %%mm4 \n\t" /* 00000RGB 2 */\
        "pand "MANGLE(bm11111000)", %%mm1 \n\t" /* 00RGB000 2.5 */\
        "por %%mm4, %%mm1 \n\t" /* 00RGBRGB 2 */\
        "movq %%mm1, %%mm4 \n\t" /* 00RGBRGB 2 */\
        "psllq $32, %%mm1 \n\t" /* BRGB0000 2 */\
        "por %%mm1, %%mm2 \n\t" /* BRGBRGBR 1 */\
\
        "psrlq $32, %%mm4 \n\t" /* 000000RG 2.5 */\
        "movq %%mm3, %%mm5 \n\t" /* 0RGB0RGB 3 */\
        "psrlq $8, %%mm3 \n\t" /* 00RGB0RG 3 */\
        "pand "MANGLE(bm00000111)", %%mm5 \n\t" /* 00000RGB 3 */\
        "pand "MANGLE(bm11111000)", %%mm3 \n\t" /* 00RGB000 3.5 */\
        "por %%mm5, %%mm3 \n\t" /* 00RGBRGB 3 */\
        "psllq $16, %%mm3 \n\t" /* RGBRGB00 3 */\
        "por %%mm4, %%mm3 \n\t" /* RGBRGBRG 2.5 */\
\
        MOVNTQ(%%mm0, (%%ebx))\
        MOVNTQ(%%mm2, 8(%%ebx))\
        MOVNTQ(%%mm3, 16(%%ebx))\
        "addl $24, %%ebx \n\t"\
\
        "addl $8, %%eax \n\t"\
        "cmpl %5, %%eax \n\t"\
        " jb 1b \n\t"

#define WRITEBGR24MMX \
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
        "movq %%mm2, %%mm1 \n\t" /* B */\
        "movq %%mm5, %%mm6 \n\t" /* R */\
        "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
        "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
        "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
        "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
        "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
        "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
        "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
        "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
        "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
        "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
        "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
        "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
        "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
        "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
        "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
        "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
        "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
        "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
        "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
        "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
        "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
        "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
        "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
        "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
        "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
        "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
        MOVNTQ(%%mm0, (%%ebx))\
\
        "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
        "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
        "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
        "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
        MOVNTQ(%%mm6, 8(%%ebx))\
\
        "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
        "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
        "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
        MOVNTQ(%%mm5, 16(%%ebx))\
\
        "addl $24, %%ebx \n\t"\
\
        "addl $8, %%eax \n\t"\
        "cmpl %5, %%eax \n\t"\
        " jb 1b \n\t"

#define WRITEBGR24MMX2 \
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
        "movq "MANGLE(M24A)", %%mm0 \n\t"\
        "movq "MANGLE(M24C)", %%mm7 \n\t"\
        "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
        "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
        "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
        "pand %%mm0, %%mm1 \n\t" /*    B2        B1       B0 */\
        "pand %%mm0, %%mm3 \n\t" /*    G2        G1       G0 */\
        "pand %%mm7, %%mm6 \n\t" /*       R1        R0       */\
\
        "psllq $8, %%mm3 \n\t" /* G2        G1       G0    */\
        "por %%mm1, %%mm6 \n\t"\
        "por %%mm3, %%mm6 \n\t"\
        MOVNTQ(%%mm6, (%%ebx))\
\
        "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
        "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
        "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
        "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
        "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
        "pand %%mm7, %%mm3 \n\t" /*       G4        G3       */\
        "pand %%mm0, %%mm6 \n\t" /*    R4        R3       R2 */\
\
        "por %%mm1, %%mm3 \n\t" /* B5    G4 B4     G3 B3    */\
        "por %%mm3, %%mm6 \n\t"\
        MOVNTQ(%%mm6, 8(%%ebx))\
\
        "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
        "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
        "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
        "pand %%mm7, %%mm1 \n\t" /*       B7        B6       */\
        "pand %%mm0, %%mm3 \n\t" /*    G7        G6       G5 */\
        "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
        "por %%mm1, %%mm3 \n\t"\
        "por %%mm3, %%mm6 \n\t"\
        MOVNTQ(%%mm6, 16(%%ebx))\
\
        "addl $24, %%ebx \n\t"\
\
        "addl $8, %%eax \n\t"\
        "cmpl %5, %%eax \n\t"\
        " jb 1b \n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif
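/* WRITEBGR24 resolves to the pshufw-based MMX2 variant when available,
   otherwise to the pure-MMX shuffle; both write 24 bytes (8 pixels) per
   iteration through %%ebx. */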

static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
                                    int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                asm volatile(
                                YSCALEYUV2YV12X(0)
                                :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
                                "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
                                : "%eax", "%edx", "%esi"
                        );

                asm volatile(
                                YSCALEYUV2YV12X(4096)
                                :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
                                "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
                                : "%eax", "%edx", "%esi"
                        );
        }

        asm volatile(
                        YSCALEYUV2YV12X(0)
                        :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
                           "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
                        : "%eax", "%edx", "%esi"
                );
#else
        yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                    chrFilter, chrSrc, chrFilterSize,
                    dest, uDest, vDest, dstW);
#endif
}

static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
                                "g" (-(dstW>>1))
                                : "%eax"
                        );

                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
                                "g" (-(dstW>>1))
                                : "%eax"
                        );
        }

        asm volatile(
                YSCALEYUV2YV121
                :: "r" (lumSrc + dstW), "r" (dest + dstW),
                "g" (-dstW)
                : "%eax"
        );
#else
        //FIXME Optimize (just quickly written, not optimized)
        //FIXME replace MINMAX with LUTs
        int i;
        for(i=0; i<dstW; i++)
        {
                int val= lumSrc[i]>>7;

                dest[i]= MIN(MAX(val, 0), 255);
        }

        if(uDest != NULL)
                for(i=0; i<(dstW>>1); i++)
                {
                        int u=chrSrc[i]>>7;
                        int v=chrSrc[i + 2048]>>7;

                        uDest[i]= MIN(MAX(u, 0), 255);
                        vDest[i]= MIN(MAX(v, 0), 255);
                }
#endif
}


/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
/*        if(flags&SWS_FULL_UV_IPOL)
        {
//FIXME
        }//FULL_UV_IPOL
        else*/
        {
#ifdef HAVE_MMX
                if(dstFormat == IMGFMT_BGR32) //FIXME untested
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                                WRITEBGR32

                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
                else if(dstFormat == IMGFMT_BGR24) //FIXME untested
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                                "leal (%%eax, %%eax, 2), %%ebx \n\t" //FIXME optimize
                                "addl %4, %%ebx \n\t"
                                WRITEBGR24

                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
                else if(dstFormat==IMGFMT_BGR15)
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4 \n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

                                WRITEBGR15

                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
                else if(dstFormat==IMGFMT_BGR16)
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2 \n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4 \n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5 \n\t"
#endif

                                WRITEBGR16

                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
#else
                yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
                            chrFilter, chrSrc, chrFilterSize,
                            dest, dstW, dstFormat);

#endif
        } //!FULL_UV_IPOL
}


/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                                    uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags)
{
        int yalpha1=yalpha^4095;
        int uvalpha1=uvalpha^4095;

        if(flags&SWS_FULL_CHR_H_INT)
        {

#ifdef HAVE_MMX
                if(dstFormat==IMGFMT_BGR32)
                {
                        asm volatile(


FULL_YSCALEYUV2RGB
                        "punpcklbw %%mm1, %%mm3 \n\t" // BGBGBGBG
                        "punpcklbw %%mm7, %%mm0 \n\t" // R0R0R0R0

                        "movq %%mm3, %%mm1 \n\t"
                        "punpcklwd %%mm0, %%mm3 \n\t" // BGR0BGR0
                        "punpckhwd %%mm0, %%mm1 \n\t" // BGR0BGR0

                        MOVNTQ(%%mm3, (%4, %%eax, 4))
                        MOVNTQ(%%mm1, 8(%4, %%eax, 4))

                        "addl $4, %%eax \n\t"
                        "cmpl %5, %%eax \n\t"
                        " jb 1b \n\t"


                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%eax"
                        );
                }
                else if(dstFormat==IMGFMT_BGR24)
875
                {
876
                        asm volatile(
877

    
878
FULL_YSCALEYUV2RGB
879

    
880
                                                                // lsb ... msb
881
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
882
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
883

    
884
                        "movq %%mm3, %%mm1                \n\t"
885
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
886
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
887

    
888
                        "movq %%mm3, %%mm2                \n\t" // BGR0BGR0
889
                        "psrlq $8, %%mm3                \n\t" // GR0BGR00
890
                        "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
891
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
892
                        "por %%mm2, %%mm3                \n\t" // BGRBGR00
893
                        "movq %%mm1, %%mm2                \n\t"
894
                        "psllq $48, %%mm1                \n\t" // 000000BG
895
                        "por %%mm1, %%mm3                \n\t" // BGRBGRBG
896

    
897
                        "movq %%mm2, %%mm1                \n\t" // BGR0BGR0
898
                        "psrld $16, %%mm2                \n\t" // R000R000
899
                        "psrlq $24, %%mm1                \n\t" // 0BGR0000
900
                        "por %%mm2, %%mm1                \n\t" // RBGRR000
901

    
902
                        "movl %4, %%ebx                        \n\t"
903
                        "addl %%eax, %%ebx                \n\t"
904

    
905
#ifdef HAVE_MMX2
906
                        //FIXME Alignment
907
                        "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
908
                        "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
909
#else
910
                        "movd %%mm3, (%%ebx, %%eax, 2)        \n\t"
911
                        "psrlq $32, %%mm3                \n\t"
912
                        "movd %%mm3, 4(%%ebx, %%eax, 2)        \n\t"
913
                        "movd %%mm1, 8(%%ebx, %%eax, 2)        \n\t"
914
#endif
915
                        "addl $4, %%eax                        \n\t"
916
                        "cmpl %5, %%eax                        \n\t"
917
                        " jb 1b                                \n\t"
918

    
919
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
920
                        "m" (yalpha1), "m" (uvalpha1)
921
                        : "%eax", "%ebx"
922
                        );
923
                }
924
                else if(dstFormat==IMGFMT_BGR15)
925
                {
926
                        asm volatile(
927

    
928
FULL_YSCALEYUV2RGB
929
#ifdef DITHER1XBPP
930
                        "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
931
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
932
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
933
#endif
934
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
935
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
936
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
937

    
938
                        "psrlw $3, %%mm3                \n\t"
939
                        "psllw $2, %%mm1                \n\t"
940
                        "psllw $7, %%mm0                \n\t"
941
                        "pand "MANGLE(g15Mask)", %%mm1        \n\t"
942
                        "pand "MANGLE(r15Mask)", %%mm0        \n\t"
943

    
944
                        "por %%mm3, %%mm1                \n\t"
945
                        "por %%mm1, %%mm0                \n\t"
946

    
947
                        MOVNTQ(%%mm0, (%4, %%eax, 2))
948

    
949
                        "addl $4, %%eax                        \n\t"
950
                        "cmpl %5, %%eax                        \n\t"
951
                        " jb 1b                                \n\t"
952

    
953
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
954
                        "m" (yalpha1), "m" (uvalpha1)
955
                        : "%eax"
956
                        );
957
                }
958
                else if(dstFormat==IMGFMT_BGR16)
959
                {
960
                        asm volatile(
961

    
962
FULL_YSCALEYUV2RGB
963
#ifdef DITHER1XBPP
964
                        "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
965
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
966
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
967
#endif
968
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
969
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
970
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
971

    
972
                        "psrlw $3, %%mm3                \n\t"
973
                        "psllw $3, %%mm1                \n\t"
974
                        "psllw $8, %%mm0                \n\t"
975
                        "pand "MANGLE(g16Mask)", %%mm1        \n\t"
976
                        "pand "MANGLE(r16Mask)", %%mm0        \n\t"
977

    
978
                        "por %%mm3, %%mm1                \n\t"
979
                        "por %%mm1, %%mm0                \n\t"
980

    
981
                        MOVNTQ(%%mm0, (%4, %%eax, 2))
982

    
983
                        "addl $4, %%eax                        \n\t"
984
                        "cmpl %5, %%eax                        \n\t"
985
                        " jb 1b                                \n\t"
986

    
987
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
988
                        "m" (yalpha1), "m" (uvalpha1)
989
                        : "%eax"
990
                        );
991
                }
992
#else
993
                if(dstFormat==IMGFMT_BGR32)
994
                {
995
                        int i;
996
                        for(i=0;i<dstW;i++){
997
                                // vertical linear interpolation && yuv2rgb in a single step:
998
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
999
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1000
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1001
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1002
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1003
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1004
                                dest+= 4;
1005
                        }
1006
                }
1007
                else if(dstFormat==IMGFMT_BGR24)
1008
                {
1009
                        int i;
1010
                        for(i=0;i<dstW;i++){
1011
                                // vertical linear interpolation && yuv2rgb in a single step:
1012
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1013
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1014
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1015
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1016
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1017
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1018
                                dest+= 3;
1019
                        }
1020
                }
1021
                else if(dstFormat==IMGFMT_BGR16)
1022
                {
1023
                        int i;
1024
                        for(i=0;i<dstW;i++){
1025
                                // vertical linear interpolation && yuv2rgb in a single step:
1026
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1027
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1028
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1029

    
1030
                                ((uint16_t*)dest)[i] =
1031
                                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1032
                                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1033
                                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1034
                        }
1035
                }
1036
                else if(dstFormat==IMGFMT_BGR15)
1037
                {
1038
                        int i;
1039
                        for(i=0;i<dstW;i++){
1040
                                // vertical linear interpolation && yuv2rgb in a single step:
1041
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1042
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1043
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1044

    
1045
                                ((uint16_t*)dest)[i] =
1046
                                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1047
                                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1048
                                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1049
                        }
1050
                }
1051
#endif
1052
        }//FULL_UV_IPOL
1053
        else
1054
        {
1055
#ifdef HAVE_MMX
1056
                if(dstFormat==IMGFMT_BGR32)
1057
                {
1058
                        asm volatile(
1059
                                YSCALEYUV2RGB
1060
                                WRITEBGR32
1061

    
1062
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1063
                        "m" (yalpha1), "m" (uvalpha1)
1064
                        : "%eax"
1065
                        );
1066
                }
1067
                else if(dstFormat==IMGFMT_BGR24)
1068
                {
1069
                        asm volatile(
1070
                                "movl %4, %%ebx                        \n\t"
1071
                                YSCALEYUV2RGB
1072
                                WRITEBGR24
1073

    
1074
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1075
                        "m" (yalpha1), "m" (uvalpha1)
1076
                        : "%eax", "%ebx"
1077
                        );
1078
                }
1079
                else if(dstFormat==IMGFMT_BGR15)
1080
                {
1081
                        asm volatile(
1082
                                YSCALEYUV2RGB
1083
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1084
#ifdef DITHER1XBPP
1085
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1086
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1087
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1088
#endif
1089

    
1090
                                WRITEBGR15
1091

    
1092
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1093
                        "m" (yalpha1), "m" (uvalpha1)
1094
                        : "%eax"
1095
                        );
1096
                }
1097
                else if(dstFormat==IMGFMT_BGR16)
1098
                {
1099
                        asm volatile(
1100
                                YSCALEYUV2RGB
1101
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1102
#ifdef DITHER1XBPP
1103
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1104
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1105
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1106
#endif
1107

    
1108
                                WRITEBGR16
1109

    
1110
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1111
                        "m" (yalpha1), "m" (uvalpha1)
1112
                        : "%eax"
1113
                        );
1114
                }
1115
#else
1116
                if(dstFormat==IMGFMT_BGR32)
1117
                {
1118
                        int i;
1119
                        for(i=0; i<dstW-1; i+=2){
1120
                                // vertical linear interpolation && yuv2rgb in a single step:
1121
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1122
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1123
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1124
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1125

    
1126
                                int Cb= yuvtab_40cf[U];
1127
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1128
                                int Cr= yuvtab_3343[V];
1129

    
1130
                                dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1131
                                dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1132
                                dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1133

    
1134
                                dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1135
                                dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1136
                                dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1137
                        }
1138
                }
1139
                else if(dstFormat==IMGFMT_BGR24)
1140
                {
1141
                        int i;
1142
                        for(i=0; i<dstW-1; i+=2){
1143
                                // vertical linear interpolation && yuv2rgb in a single step:
1144
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1145
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1146
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1147
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1148

    
1149
                                int Cb= yuvtab_40cf[U];
1150
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1151
                                int Cr= yuvtab_3343[V];
1152

    
1153
                                dest[0]=clip_table[((Y1 + Cb) >>13)];
1154
                                dest[1]=clip_table[((Y1 + Cg) >>13)];
1155
                                dest[2]=clip_table[((Y1 + Cr) >>13)];
1156

    
1157
                                dest[3]=clip_table[((Y2 + Cb) >>13)];
1158
                                dest[4]=clip_table[((Y2 + Cg) >>13)];
1159
                                dest[5]=clip_table[((Y2 + Cr) >>13)];
1160
                                dest+=6;
1161
                        }
1162
                }
1163
                else if(dstFormat==IMGFMT_BGR16)
1164
                {
1165
                        int i;
1166
#ifdef DITHER1XBPP
1167
                        static int ditherb1=1<<14;
1168
                        static int ditherg1=1<<13;
1169
                        static int ditherr1=2<<14;
1170
                        static int ditherb2=3<<14;
1171
                        static int ditherg2=3<<13;
1172
                        static int ditherr2=0<<14;
1173

    
1174
                        ditherb1 ^= (1^2)<<14;
1175
                        ditherg1 ^= (1^2)<<13;
1176
                        ditherr1 ^= (1^2)<<14;
1177
                        ditherb2 ^= (3^0)<<14;
1178
                        ditherg2 ^= (3^0)<<13;
1179
                        ditherr2 ^= (3^0)<<14;
1180
#else
1181
                        const int ditherb1=0;
1182
                        const int ditherg1=0;
1183
                        const int ditherr1=0;
1184
                        const int ditherb2=0;
1185
                        const int ditherg2=0;
1186
                        const int ditherr2=0;
1187
#endif
1188
                        for(i=0; i<dstW-1; i+=2){
1189
                                // vertical linear interpolation && yuv2rgb in a single step:
1190
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1191
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1192
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1193
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1194

    
1195
                                int Cb= yuvtab_40cf[U];
1196
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1197
                                int Cr= yuvtab_3343[V];
1198

    
1199
                                ((uint16_t*)dest)[i] =
1200
                                        clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1201
                                        clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1202
                                        clip_table16r[(Y1 + Cr + ditherr1) >>13];
1203

    
1204
                                ((uint16_t*)dest)[i+1] =
1205
                                        clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1206
                                        clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1207
                                        clip_table16r[(Y2 + Cr + ditherr2) >>13];
1208
                        }
1209
                }
1210
                else if(dstFormat==IMGFMT_BGR15)
1211
                {
1212
                        int i;
1213
#ifdef DITHER1XBPP
1214
                        static int ditherb1=1<<14;
1215
                        static int ditherg1=1<<14;
1216
                        static int ditherr1=2<<14;
1217
                        static int ditherb2=3<<14;
1218
                        static int ditherg2=3<<14;
1219
                        static int ditherr2=0<<14;
1220

    
1221
                        ditherb1 ^= (1^2)<<14;
1222
                        ditherg1 ^= (1^2)<<14;
1223
                        ditherr1 ^= (1^2)<<14;
1224
                        ditherb2 ^= (3^0)<<14;
1225
                        ditherg2 ^= (3^0)<<14;
1226
                        ditherr2 ^= (3^0)<<14;
1227
#else
1228
                        const int ditherb1=0;
1229
                        const int ditherg1=0;
1230
                        const int ditherr1=0;
1231
                        const int ditherb2=0;
1232
                        const int ditherg2=0;
1233
                        const int ditherr2=0;
1234
#endif
1235
                        for(i=0; i<dstW-1; i+=2){
1236
                                // vertical linear interpolation && yuv2rgb in a single step:
1237
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1238
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1239
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1240
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1241

    
1242
                                int Cb= yuvtab_40cf[U];
1243
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1244
                                int Cr= yuvtab_3343[V];
1245

    
1246
                                ((uint16_t*)dest)[i] =
1247
                                        clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1248
                                        clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1249
                                        clip_table15r[(Y1 + Cr + ditherr1) >>13];
1250

    
1251
                                ((uint16_t*)dest)[i+1] =
1252
                                        clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1253
                                        clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1254
                                        clip_table15r[(Y2 + Cr + ditherr2) >>13];
1255
                        }
1256
                }
#endif
        } //!FULL_UV_IPOL
}
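
/* Illustrative reference (not compiled in): what one BGR32 output pixel of the
 * C fall-back above computes in the full-chroma (SWS_FULL_CHR_H_INT) path,
 * with the yuvtab_xxxx and clip_table lookups replaced by the usual BT.601
 * integer approximation.  The constants and the helper name are assumptions
 * for illustration only; the real tables are built elsewhere in swscale, and
 * the non-full path indexes uvbuf at i>>1 instead of i. */
#if 0
static void yuv2bgr32_pixel_ref(uint16_t *buf0, uint16_t *buf1,
                                uint16_t *uvbuf0, uint16_t *uvbuf1,
                                uint8_t *dest, int i, int yalpha, int uvalpha)
{
        int yalpha1 = 4095 - yalpha;   /* same as yalpha^4095 for 0..4095 */
        int uvalpha1= 4095 - uvalpha;
        /* vertical linear interpolation between the two source lines */
        int Y= (buf0[i]*yalpha1 + buf1[i]*yalpha)>>19;                         /* ~0..255 */
        int U= ((uvbuf0[i     ]*uvalpha1 + uvbuf1[i     ]*uvalpha)>>19) - 128;
        int V= ((uvbuf0[i+2048]*uvalpha1 + uvbuf1[i+2048]*uvalpha)>>19) - 128;
        int C= 298*(Y-16);
        int b= (C + 516*U         + 128)>>8;
        int g= (C - 100*U - 208*V + 128)>>8;
        int r= (C         + 409*V + 128)>>8;
#define CLIP8(x) ((x)<0 ? 0 : (x)>255 ? 255 : (x))
        dest[4*i+0]= CLIP8(b);
        dest[4*i+1]= CLIP8(g);
        dest[4*i+2]= CLIP8(r);
#undef CLIP8
}
#endif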
1260

    
1261
/**
1262
 * YV12 to RGB without scaling or interpolating
1263
 */
1264
static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1265
                            uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags)
1266
{
1267
        int uvalpha1=uvalpha^4095;
1268
        const int yalpha1=0;
1269

    
1270
        if(flags&SWS_FULL_CHR_H_INT)
1271
        {
1272
                RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags);
1273
                return;
1274
        }
1275

    
1276
#ifdef HAVE_MMX
1277
        if( uvalpha < 2048 ) // note: this is not correct (it shifts chrominance by 0.5 pixels), but it is a bit faster
1278
        {
1279
                if(dstFormat==IMGFMT_BGR32)
1280
                {
1281
                        asm volatile(
1282
                                YSCALEYUV2RGB1
1283
                                WRITEBGR32
1284
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1285
                        "m" (yalpha1), "m" (uvalpha1)
1286
                        : "%eax"
1287
                        );
1288
                }
1289
                else if(dstFormat==IMGFMT_BGR24)
1290
                {
1291
                        asm volatile(
1292
                                "movl %4, %%ebx                        \n\t"
1293
                                YSCALEYUV2RGB1
1294
                                WRITEBGR24
1295
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1296
                        "m" (yalpha1), "m" (uvalpha1)
1297
                        : "%eax", "%ebx"
1298
                        );
1299
                }
1300
                else if(dstFormat==IMGFMT_BGR15)
1301
                {
1302
                        asm volatile(
1303
                                YSCALEYUV2RGB1
1304
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1305
#ifdef DITHER1XBPP
1306
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1307
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1308
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1309
#endif
1310
                                WRITEBGR15
1311
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1312
                        "m" (yalpha1), "m" (uvalpha1)
1313
                        : "%eax"
1314
                        );
1315
                }
1316
                else if(dstFormat==IMGFMT_BGR16)
1317
                {
1318
                        asm volatile(
1319
                                YSCALEYUV2RGB1
1320
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1321
#ifdef DITHER1XBPP
1322
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1323
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1324
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1325
#endif
1326

    
1327
                                WRITEBGR16
1328
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1329
                        "m" (yalpha1), "m" (uvalpha1)
1330
                        : "%eax"
1331
                        );
1332
                }
1333
        }
1334
        else
1335
        {
1336
                if(dstFormat==IMGFMT_BGR32)
1337
                {
1338
                        asm volatile(
1339
                                YSCALEYUV2RGB1b
1340
                                WRITEBGR32
1341
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1342
                        "m" (yalpha1), "m" (uvalpha1)
1343
                        : "%eax"
1344
                        );
1345
                }
1346
                else if(dstFormat==IMGFMT_BGR24)
1347
                {
1348
                        asm volatile(
1349
                                "movl %4, %%ebx                        \n\t"
1350
                                YSCALEYUV2RGB1b
1351
                                WRITEBGR24
1352
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1353
                        "m" (yalpha1), "m" (uvalpha1)
1354
                        : "%eax", "%ebx"
1355
                        );
1356
                }
1357
                else if(dstFormat==IMGFMT_BGR15)
1358
                {
1359
                        asm volatile(
1360
                                YSCALEYUV2RGB1b
1361
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1362
#ifdef DITHER1XBPP
1363
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1364
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1365
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1366
#endif
1367
                                WRITEBGR15
1368
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1369
                        "m" (yalpha1), "m" (uvalpha1)
1370
                        : "%eax"
1371
                        );
1372
                }
1373
                else if(dstFormat==IMGFMT_BGR16)
1374
                {
1375
                        asm volatile(
1376
                                YSCALEYUV2RGB1b
1377
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1378
#ifdef DITHER1XBPP
1379
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1380
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1381
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1382
#endif
1383

    
1384
                                WRITEBGR16
1385
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1386
                        "m" (yalpha1), "m" (uvalpha1)
1387
                        : "%eax"
1388
                        );
1389
                }
1390
        }
1391
#else
1392
//FIXME write 2 versions (for even & odd lines)
1393

    
1394
        if(dstFormat==IMGFMT_BGR32)
1395
        {
1396
                int i;
1397
                for(i=0; i<dstW-1; i+=2){
1398
                        // vertical linear interpolation && yuv2rgb in a single step:
1399
                        int Y1=yuvtab_2568[buf0[i]>>7];
1400
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1401
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1402
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1403

    
1404
                        int Cb= yuvtab_40cf[U];
1405
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1406
                        int Cr= yuvtab_3343[V];
1407

    
1408
                        dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1409
                        dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1410
                        dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1411

    
1412
                        dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1413
                        dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1414
                        dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1415
                }
1416
        }
1417
        else if(dstFormat==IMGFMT_BGR24)
1418
        {
1419
                int i;
1420
                for(i=0; i<dstW-1; i+=2){
1421
                        // vertical linear interpolation && yuv2rgb in a single step:
1422
                        int Y1=yuvtab_2568[buf0[i]>>7];
1423
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1424
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1425
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1426

    
1427
                        int Cb= yuvtab_40cf[U];
1428
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1429
                        int Cr= yuvtab_3343[V];
1430

    
1431
                        dest[0]=clip_table[((Y1 + Cb) >>13)];
1432
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
1433
                        dest[2]=clip_table[((Y1 + Cr) >>13)];
1434

    
1435
                        dest[3]=clip_table[((Y2 + Cb) >>13)];
1436
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
1437
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
1438
                        dest+=6;
1439
                }
1440
        }
1441
        else if(dstFormat==IMGFMT_BGR16)
1442
        {
1443
                int i;
1444
#ifdef DITHER1XBPP
1445
                static int ditherb1=1<<14;
1446
                static int ditherg1=1<<13;
1447
                static int ditherr1=2<<14;
1448
                static int ditherb2=3<<14;
1449
                static int ditherg2=3<<13;
1450
                static int ditherr2=0<<14;
1451

    
1452
                ditherb1 ^= (1^2)<<14;
1453
                ditherg1 ^= (1^2)<<13;
1454
                ditherr1 ^= (1^2)<<14;
1455
                ditherb2 ^= (3^0)<<14;
1456
                ditherg2 ^= (3^0)<<13;
1457
                ditherr2 ^= (3^0)<<14;
1458
#else
1459
                const int ditherb1=0;
1460
                const int ditherg1=0;
1461
                const int ditherr1=0;
1462
                const int ditherb2=0;
1463
                const int ditherg2=0;
1464
                const int ditherr2=0;
1465
#endif
1466
                for(i=0; i<dstW-1; i+=2){
1467
                        // vertical linear interpolation && yuv2rgb in a single step:
1468
                        int Y1=yuvtab_2568[buf0[i]>>7];
1469
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1470
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1471
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1472

    
1473
                        int Cb= yuvtab_40cf[U];
1474
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1475
                        int Cr= yuvtab_3343[V];
1476

    
1477
                        ((uint16_t*)dest)[i] =
1478
                                clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1479
                                clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1480
                                clip_table16r[(Y1 + Cr + ditherr1) >>13];
1481

    
1482
                        ((uint16_t*)dest)[i+1] =
1483
                                clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1484
                                clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1485
                                clip_table16r[(Y2 + Cr + ditherr2) >>13];
1486
                }
1487
        }
1488
        else if(dstFormat==IMGFMT_BGR15)
1489
        {
1490
                int i;
1491
#ifdef DITHER1XBPP
1492
                static int ditherb1=1<<14;
1493
                static int ditherg1=1<<14;
1494
                static int ditherr1=2<<14;
1495
                static int ditherb2=3<<14;
1496
                static int ditherg2=3<<14;
1497
                static int ditherr2=0<<14;
1498

    
1499
                ditherb1 ^= (1^2)<<14;
1500
                ditherg1 ^= (1^2)<<14;
1501
                ditherr1 ^= (1^2)<<14;
1502
                ditherb2 ^= (3^0)<<14;
1503
                ditherg2 ^= (3^0)<<14;
1504
                ditherr2 ^= (3^0)<<14;
1505
#else
1506
                const int ditherb1=0;
1507
                const int ditherg1=0;
1508
                const int ditherr1=0;
1509
                const int ditherb2=0;
1510
                const int ditherg2=0;
1511
                const int ditherr2=0;
1512
#endif
1513
                for(i=0; i<dstW-1; i+=2){
1514
                        // vertical linear interpolation && yuv2rgb in a single step:
1515
                        int Y1=yuvtab_2568[buf0[i]>>7];
1516
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1517
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1518
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1519

    
1520
                        int Cb= yuvtab_40cf[U];
1521
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1522
                        int Cr= yuvtab_3343[V];
1523

    
1524
                        ((uint16_t*)dest)[i] =
1525
                                clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1526
                                clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1527
                                clip_table15r[(Y1 + Cr + ditherr1) >>13];
1528

    
1529
                        ((uint16_t*)dest)[i+1] =
1530
                                clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1531
                                clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1532
                                clip_table15r[(Y2 + Cr + ditherr2) >>13];
1533
                }
        }
#endif
}
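
/* Illustrative reference (not compiled in): the 5-6-5 packing that the
 * clip_table16b/g/r lookups and WRITEBGR16 produce for IMGFMT_BGR16 output
 * (blue in bits 0..4, green in 5..10, red in 11..15, the same layout that
 * bgr16ToY() later in this file reads back).  The ditherb/g/r offsets above
 * add a 2x2 ordered-dither bias of 0, 1/4, 1/2 or 3/4 of one output LSB
 * before the final shift.  Helper name is an assumption for illustration. */
#if 0
static uint16_t pack_bgr16_ref(int r, int g, int b)  /* 8-bit components */
{
        if(r<0) r=0; else if(r>255) r=255;
        if(g<0) g=0; else if(g>255) g=255;
        if(b<0) b=0; else if(b>255) b=255;
        return (uint16_t)( ((r>>3)<<11) | ((g>>2)<<5) | (b>>3) );
}
#endif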
1537

    
1538
//FIXME yuy2* can read up to 7 samples too much

static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
        asm volatile(
                "movq "MANGLE(bm01010101)", %%mm2\n\t"
                "movl %0, %%eax                        \n\t"
                "1:                                \n\t"
                "movq (%1, %%eax,2), %%mm0        \n\t"
                "movq 8(%1, %%eax,2), %%mm1        \n\t"
                "pand %%mm2, %%mm0                \n\t"
                "pand %%mm2, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0                \n\t"
                "movq %%mm0, (%2, %%eax)        \n\t"
                "addl $8, %%eax                        \n\t"
                " js 1b                                \n\t"
                : : "g" (-width), "r" (src+width*2), "r" (dst+width)
                : "%eax"
        );
#else
        int i;
        for(i=0; i<width; i++)
                dst[i]= src[2*i];
#endif
}

static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        asm volatile(
                "movq "MANGLE(bm01010101)", %%mm4\n\t"
                "movl %0, %%eax                        \n\t"
                "1:                                \n\t"
                "movq (%1, %%eax,4), %%mm0        \n\t"
                "movq 8(%1, %%eax,4), %%mm1        \n\t"
                "movq (%2, %%eax,4), %%mm2        \n\t"
                "movq 8(%2, %%eax,4), %%mm3        \n\t"
                PAVGB(%%mm2, %%mm0)
                PAVGB(%%mm3, %%mm1)
                "psrlw $8, %%mm0                \n\t"
                "psrlw $8, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0                \n\t"
                "movq %%mm0, %%mm1                \n\t"
                "psrlw $8, %%mm0                \n\t"
                "pand %%mm4, %%mm1                \n\t"
                "packuswb %%mm0, %%mm0                \n\t"
                "packuswb %%mm1, %%mm1                \n\t"
                "movd %%mm0, (%4, %%eax)        \n\t"
                "movd %%mm1, (%3, %%eax)        \n\t"
                "addl $4, %%eax                        \n\t"
                " js 1b                                \n\t"
                : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
                : "%eax"
        );
#else
        int i;
        for(i=0; i<width; i++)
        {
                dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
                dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
        }
#endif
}
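
/* Illustrative reference (not compiled in): the packed YUY2 byte layout the
 * two readers above assume.  One 4-byte macropixel covers two horizontal
 * pixels, Y0 U Y1 V, so luma is every second byte and chroma is averaged over
 * the two input lines, exactly as in the C fall-backs.  The helper name and
 * the combined interface are assumptions for illustration only. */
#if 0
static void yuy2_split_ref(uint8_t *dstY, uint8_t *dstU, uint8_t *dstV,
                           uint8_t *line0, uint8_t *line1, int chromaWidth)
{
        int i;
        for(i=0; i<chromaWidth; i++)
        {
                dstY[2*i  ]= line0[4*i + 0];
                dstY[2*i+1]= line0[4*i + 2];
                dstU[i]= (line0[4*i + 1] + line1[4*i + 1])>>1;
                dstV[i]= (line0[4*i + 3] + line1[4*i + 3])>>1;
        }
}
#endif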

static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMXFIXME
#else
        int i;
        for(i=0; i<width; i++)
        {
                int b= src[i*4+0];
                int g= src[i*4+1];
                int r= src[i*4+2];

                dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
        }
#endif
}

static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMXFIXME
#else
        int i;
        for(i=0; i<width; i++)
        {
                int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
                int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
                int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];

                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
        }
#endif
}
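
/* Illustrative reference (not compiled in): the fixed-point RGB->YUV step
 * shared by all the *ToY / *ToUV input readers.  RY..BV are the conversion
 * matrix coefficients scaled by 1<<RGB2YUV_SHIFT; the UV readers additionally
 * average a 2x2 block of source pixels, which is where their extra +2 in the
 * shift comes from.  Helper name is an assumption for illustration only. */
#if 0
static void rgb2yuv_pixel_ref(uint8_t *py, uint8_t *pu, uint8_t *pv,
                              int r, int g, int b)   /* 8-bit components */
{
        *py= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
        *pu= ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
        *pv= ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
}
#endif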
1635

    
1636
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1637
{
1638
#ifdef HAVE_MMX
1639
        asm volatile(
1640
                "movl %2, %%eax                        \n\t"
1641
                "movq bgr2YCoeff, %%mm6                \n\t"
1642
                "movq w1111, %%mm5                \n\t"
1643
                "pxor %%mm7, %%mm7                \n\t"
1644
                "leal (%%eax, %%eax, 2), %%ebx        \n\t"
1645
                ".balign 16                        \n\t"
1646
                "1:                                \n\t"
1647
                PREFETCH" 64(%0, %%ebx)                \n\t"
1648
                "movd (%0, %%ebx), %%mm0        \n\t"
1649
                "movd 3(%0, %%ebx), %%mm1        \n\t"
1650
                "punpcklbw %%mm7, %%mm0                \n\t"
1651
                "punpcklbw %%mm7, %%mm1                \n\t"
1652
                "movd 6(%0, %%ebx), %%mm2        \n\t"
1653
                "movd 9(%0, %%ebx), %%mm3        \n\t"
1654
                "punpcklbw %%mm7, %%mm2                \n\t"
1655
                "punpcklbw %%mm7, %%mm3                \n\t"
1656
                "pmaddwd %%mm6, %%mm0                \n\t"
1657
                "pmaddwd %%mm6, %%mm1                \n\t"
1658
                "pmaddwd %%mm6, %%mm2                \n\t"
1659
                "pmaddwd %%mm6, %%mm3                \n\t"
1660
#ifndef FAST_BGR2YV12
1661
                "psrad $8, %%mm0                \n\t"
1662
                "psrad $8, %%mm1                \n\t"
1663
                "psrad $8, %%mm2                \n\t"
1664
                "psrad $8, %%mm3                \n\t"
1665
#endif
1666
                "packssdw %%mm1, %%mm0                \n\t"
1667
                "packssdw %%mm3, %%mm2                \n\t"
1668
                "pmaddwd %%mm5, %%mm0                \n\t"
1669
                "pmaddwd %%mm5, %%mm2                \n\t"
1670
                "packssdw %%mm2, %%mm0                \n\t"
1671
                "psraw $7, %%mm0                \n\t"
1672

    
1673
                "movd 12(%0, %%ebx), %%mm4        \n\t"
1674
                "movd 15(%0, %%ebx), %%mm1        \n\t"
1675
                "punpcklbw %%mm7, %%mm4                \n\t"
1676
                "punpcklbw %%mm7, %%mm1                \n\t"
1677
                "movd 18(%0, %%ebx), %%mm2        \n\t"
1678
                "movd 21(%0, %%ebx), %%mm3        \n\t"
1679
                "punpcklbw %%mm7, %%mm2                \n\t"
1680
                "punpcklbw %%mm7, %%mm3                \n\t"
1681
                "pmaddwd %%mm6, %%mm4                \n\t"
1682
                "pmaddwd %%mm6, %%mm1                \n\t"
1683
                "pmaddwd %%mm6, %%mm2                \n\t"
1684
                "pmaddwd %%mm6, %%mm3                \n\t"
1685
#ifndef FAST_BGR2YV12
1686
                "psrad $8, %%mm4                \n\t"
1687
                "psrad $8, %%mm1                \n\t"
1688
                "psrad $8, %%mm2                \n\t"
1689
                "psrad $8, %%mm3                \n\t"
1690
#endif
1691
                "packssdw %%mm1, %%mm4                \n\t"
1692
                "packssdw %%mm3, %%mm2                \n\t"
1693
                "pmaddwd %%mm5, %%mm4                \n\t"
1694
                "pmaddwd %%mm5, %%mm2                \n\t"
1695
                "addl $24, %%ebx                \n\t"
1696
                "packssdw %%mm2, %%mm4                \n\t"
1697
                "psraw $7, %%mm4                \n\t"
1698

    
1699
                "packuswb %%mm4, %%mm0                \n\t"
1700
                "paddusb bgr2YOffset, %%mm0        \n\t"
1701

    
1702
                MOVNTQ(%%mm0, (%1, %%eax))
1703
                "addl $8, %%eax                        \n\t"
1704
                " js 1b                                \n\t"
1705
                : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1706
                : "%eax", "%ebx"
1707
        );
1708
#else
1709
        int i;
1710
        for(i=0; i<width; i++)
1711
        {
1712
                int b= src[i*3+0];
1713
                int g= src[i*3+1];
1714
                int r= src[i*3+2];
1715

    
1716
                dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1717
        }
1718
#endif
1719
}
1720

    
1721
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1722
{
1723
#ifdef HAVE_MMXFIXME
1724
#else
1725
        int i;
1726
        for(i=0; i<width; i++)
1727
        {
1728
                int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1729
                int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1730
                int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1731

    
1732
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1733
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1734
        }
1735
#endif
1736
}
1737

    
1738
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
{
        int i;
        for(i=0; i<width; i++)
        {
                int d= src[i*2] + (src[i*2+1]<<8);
                int b= d&0x1F;
                int g= (d>>5)&0x3F;
                int r= (d>>11)&0x1F;

                dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
        }
}
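
/* Illustrative reference (not compiled in): why bgr16ToY() weights R and B by
 * 2 and shifts by RGB2YUV_SHIFT-2.  The 5-bit R/B and 6-bit G fields are first
 * brought to a common 6-bit scale (2*r, g, 2*b); that is still a factor of 4
 * short of the 8-bit range the RY/GY/BY coefficients expect, hence two fewer
 * shift positions.  A slower, slightly more exact expansion to 8 bits would
 * look like this; the helper name is an assumption for illustration only. */
#if 0
static int bgr16_luma_ref(int d)   /* one RGB565 word */
{
        int b8= (( d      &0x1F)*255)/31;   /* 5 -> 8 bits */
        int g8= (((d>>5)  &0x3F)*255)/63;   /* 6 -> 8 bits */
        int r8= (((d>>11) &0x1F)*255)/31;
        return ((RY*r8 + GY*g8 + BY*b8)>>RGB2YUV_SHIFT) + 16;
}
#endif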
1751

    
1752
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1753
{
1754
        int i;
1755
        for(i=0; i<width; i++)
1756
        {
1757
#if 1
1758
                int d0= le2me_32( ((uint32_t*)src1)[i] );
1759
                int d1= le2me_32( ((uint32_t*)src2)[i] );
1760
                
1761
                int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1762
                int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1763

    
1764
                int dh2= (dh>>11) + (dh<<21);
1765
                int d= dh2 + dl;
1766

    
1767
                int b= d&0x7F;
1768
                int r= (d>>11)&0x7F;
1769
                int g= d>>21;
1770
#else
1771
                int d0= src1[i*4] + (src1[i*4+1]<<8);
1772
                int b0= d0&0x1F;
1773
                int g0= (d0>>5)&0x3F;
1774
                int r0= (d0>>11)&0x1F;
1775

    
1776
                int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1777
                int b1= d1&0x1F;
1778
                int g1= (d1>>5)&0x3F;
1779
                int r1= (d1>>11)&0x1F;
1780

    
1781
                int d2= src2[i*4] + (src2[i*4+1]<<8);
1782
                int b2= d2&0x1F;
1783
                int g2= (d2>>5)&0x3F;
1784
                int r2= (d2>>11)&0x1F;
1785

    
1786
                int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1787
                int b3= d3&0x1F;
1788
                int g3= (d3>>5)&0x3F;
1789
                int r3= (d3>>11)&0x1F;
1790

    
1791
                int b= b0 + b1 + b2 + b3;
1792
                int g= g0 + g1 + g2 + g3;
1793
                int r= r0 + r1 + r2 + r3;
1794
#endif
1795
                dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
                dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
        }
}
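
/* Illustrative reference (not compiled in): what the masked 32-bit arithmetic
 * in the #if 1 branch of bgr16ToUV() above computes.  The 0x07E0F81F-style
 * masks keep alternating colour fields apart so that two RGB565 pixels per
 * source line can be added without one field carrying into the next; written
 * out per pixel (as in the #else branch) the same 2x2 sums look like this.
 * Helper name is an assumption for illustration only. */
#if 0
static void bgr16_sum_2x2_ref(uint8_t *src1, uint8_t *src2, int i,
                              int *r, int *g, int *b)
{
        uint8_t *s[2]= {src1, src2};
        int k, sr=0, sg=0, sb=0;
        for(k=0; k<2; k++)
        {
                int d0= s[k][i*4 + 0] + (s[k][i*4 + 1]<<8);   /* left pixel  */
                int d1= s[k][i*4 + 2] + (s[k][i*4 + 3]<<8);   /* right pixel */
                sb += ( d0      &0x1F) + ( d1      &0x1F);
                sg += ((d0>>5)  &0x3F) + ((d1>>5)  &0x3F);
                sr += ((d0>>11) &0x1F) + ((d1>>11) &0x1F);
        }
        *r= sr; *g= sg; *b= sb;
}
#endif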
1799

    
1800
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1801
{
1802
        int i;
1803
        for(i=0; i<width; i++)
1804
        {
1805
                int d= src[i*2] + (src[i*2+1]<<8);
1806
                int b= d&0x1F;
1807
                int g= (d>>5)&0x1F;
1808
                int r= (d>>10)&0x1F;
1809

    
1810
                dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1811
        }
1812
}
1813

    
1814
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1815
{
1816
        int i;
1817
        for(i=0; i<width; i++)
1818
        {
1819
#if 1
1820
                int d0= le2me_32( ((uint32_t*)src1)[i] );
1821
                int d1= le2me_32( ((uint32_t*)src2)[i] );
1822
                
1823
                int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1824
                int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1825

    
1826
                int dh2= (dh>>11) + (dh<<21);
1827
                int d= dh2 + dl;
1828

    
1829
                int b= d&0x7F;
1830
                int r= (d>>10)&0x7F;
1831
                int g= d>>21;
1832
#else
1833
                int d0= src1[i*4] + (src1[i*4+1]<<8);
1834
                int b0= d0&0x1F;
1835
                int g0= (d0>>5)&0x1F;
1836
                int r0= (d0>>10)&0x1F;
1837

    
1838
                int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1839
                int b1= d1&0x1F;
1840
                int g1= (d1>>5)&0x1F;
1841
                int r1= (d1>>10)&0x1F;
1842

    
1843
                int d2= src2[i*4] + (src2[i*4+1]<<8);
1844
                int b2= d2&0x1F;
1845
                int g2= (d2>>5)&0x1F;
1846
                int r2= (d2>>10)&0x1F;
1847

    
1848
                int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1849
                int b3= d3&0x1F;
1850
                int g3= (d3>>5)&0x1F;
1851
                int r3= (d3>>10)&0x1F;
1852

    
1853
                int b= b0 + b1 + b2 + b3;
1854
                int g= g0 + g1 + g2 + g3;
1855
                int r= r0 + r1 + r2 + r3;
1856
#endif
1857
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1858
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
1859
        }
1860
}
1861

    
1862

    
1863
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
1864
{
1865
        int i;
1866
        for(i=0; i<width; i++)
1867
        {
1868
                int r= src[i*4+0];
1869
                int g= src[i*4+1];
1870
                int b= src[i*4+2];
1871

    
1872
                dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1873
        }
1874
}
1875

    
1876
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1877
{
1878
        int i;
1879
        for(i=0; i<width; i++)
1880
        {
1881
                int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1882
                int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1883
                int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1884

    
1885
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1886
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1887
        }
1888
}
1889

    
1890
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
1891
{
1892
        int i;
1893
        for(i=0; i<width; i++)
1894
        {
1895
                int r= src[i*3+0];
1896
                int g= src[i*3+1];
1897
                int b= src[i*3+2];
1898

    
1899
                dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1900
        }
1901
}
1902

    
1903
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1904
{
1905
        int i;
1906
        for(i=0; i<width; i++)
1907
        {
1908
                int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1909
                int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1910
                int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1911

    
1912
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1913
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1914
        }
1915
}
1916

    
1917

    
1918
// Bilinear / Bicubic scaling
1919
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1920
                                  int16_t *filter, int16_t *filterPos, int filterSize)
1921
{
1922
#ifdef HAVE_MMX
1923
        if(filterSize==4) // always true for upscaling, sometimes for downscaling too
1924
        {
1925
                int counter= -2*dstW;
1926
                filter-= counter*2;
1927
                filterPos-= counter/2;
1928
                dst-= counter/2;
1929
                asm volatile(
1930
                        "pxor %%mm7, %%mm7                \n\t"
1931
                        "movq "MANGLE(w02)", %%mm6        \n\t"
1932
                        "pushl %%ebp                        \n\t" // we use 7 regs here ...
1933
                        "movl %%eax, %%ebp                \n\t"
1934
                        ".balign 16                        \n\t"
1935
                        "1:                                \n\t"
1936
                        "movzwl (%2, %%ebp), %%eax        \n\t"
1937
                        "movzwl 2(%2, %%ebp), %%ebx        \n\t"
1938
                        "movq (%1, %%ebp, 4), %%mm1        \n\t"
1939
                        "movq 8(%1, %%ebp, 4), %%mm3        \n\t"
1940
                        "movd (%3, %%eax), %%mm0        \n\t"
1941
                        "movd (%3, %%ebx), %%mm2        \n\t"
1942
                        "punpcklbw %%mm7, %%mm0                \n\t"
1943
                        "punpcklbw %%mm7, %%mm2                \n\t"
1944
                        "pmaddwd %%mm1, %%mm0                \n\t"
1945
                        "pmaddwd %%mm2, %%mm3                \n\t"
1946
                        "psrad $8, %%mm0                \n\t"
1947
                        "psrad $8, %%mm3                \n\t"
1948
                        "packssdw %%mm3, %%mm0                \n\t"
1949
                        "pmaddwd %%mm6, %%mm0                \n\t"
1950
                        "packssdw %%mm0, %%mm0                \n\t"
1951
                        "movd %%mm0, (%4, %%ebp)        \n\t"
1952
                        "addl $4, %%ebp                        \n\t"
1953
                        " jnc 1b                        \n\t"
1954

    
1955
                        "popl %%ebp                        \n\t"
1956
                        : "+a" (counter)
1957
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1958
                        : "%ebx"
1959
                );
1960
        }
1961
        else if(filterSize==8)
1962
        {
1963
                int counter= -2*dstW;
1964
                filter-= counter*4;
1965
                filterPos-= counter/2;
1966
                dst-= counter/2;
1967
                asm volatile(
1968
                        "pxor %%mm7, %%mm7                \n\t"
1969
                        "movq "MANGLE(w02)", %%mm6        \n\t"
1970
                        "pushl %%ebp                        \n\t" // we use 7 regs here ...
1971
                        "movl %%eax, %%ebp                \n\t"
1972
                        ".balign 16                        \n\t"
1973
                        "1:                                \n\t"
1974
                        "movzwl (%2, %%ebp), %%eax        \n\t"
1975
                        "movzwl 2(%2, %%ebp), %%ebx        \n\t"
1976
                        "movq (%1, %%ebp, 8), %%mm1        \n\t"
1977
                        "movq 16(%1, %%ebp, 8), %%mm3        \n\t"
1978
                        "movd (%3, %%eax), %%mm0        \n\t"
1979
                        "movd (%3, %%ebx), %%mm2        \n\t"
1980
                        "punpcklbw %%mm7, %%mm0                \n\t"
1981
                        "punpcklbw %%mm7, %%mm2                \n\t"
1982
                        "pmaddwd %%mm1, %%mm0                \n\t"
1983
                        "pmaddwd %%mm2, %%mm3                \n\t"
1984

    
1985
                        "movq 8(%1, %%ebp, 8), %%mm1        \n\t"
1986
                        "movq 24(%1, %%ebp, 8), %%mm5        \n\t"
1987
                        "movd 4(%3, %%eax), %%mm4        \n\t"
1988
                        "movd 4(%3, %%ebx), %%mm2        \n\t"
1989
                        "punpcklbw %%mm7, %%mm4                \n\t"
1990
                        "punpcklbw %%mm7, %%mm2                \n\t"
1991
                        "pmaddwd %%mm1, %%mm4                \n\t"
1992
                        "pmaddwd %%mm2, %%mm5                \n\t"
1993
                        "paddd %%mm4, %%mm0                \n\t"
1994
                        "paddd %%mm5, %%mm3                \n\t"
1995
                                                
1996
                        "psrad $8, %%mm0                \n\t"
1997
                        "psrad $8, %%mm3                \n\t"
1998
                        "packssdw %%mm3, %%mm0                \n\t"
1999
                        "pmaddwd %%mm6, %%mm0                \n\t"
2000
                        "packssdw %%mm0, %%mm0                \n\t"
2001
                        "movd %%mm0, (%4, %%ebp)        \n\t"
2002
                        "addl $4, %%ebp                        \n\t"
2003
                        " jnc 1b                        \n\t"
2004

    
2005
                        "popl %%ebp                        \n\t"
2006
                        : "+a" (counter)
2007
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2008
                        : "%ebx"
2009
                );
2010
        }
2011
        else
2012
        {
2013
                int counter= -2*dstW;
2014
//                filter-= counter*filterSize/2;
2015
                filterPos-= counter/2;
2016
                dst-= counter/2;
2017
                asm volatile(
2018
                        "pxor %%mm7, %%mm7                \n\t"
2019
                        "movq "MANGLE(w02)", %%mm6        \n\t"
2020
                        ".balign 16                        \n\t"
2021
                        "1:                                \n\t"
2022
                        "movl %2, %%ecx                        \n\t"
2023
                        "movzwl (%%ecx, %0), %%eax        \n\t"
2024
                        "movzwl 2(%%ecx, %0), %%ebx        \n\t"
2025
                        "movl %5, %%ecx                        \n\t"
2026
                        "pxor %%mm4, %%mm4                \n\t"
2027
                        "pxor %%mm5, %%mm5                \n\t"
2028
                        "2:                                \n\t"
2029
                        "movq (%1), %%mm1                \n\t"
2030
                        "movq (%1, %6), %%mm3                \n\t"
2031
                        "movd (%%ecx, %%eax), %%mm0        \n\t"
2032
                        "movd (%%ecx, %%ebx), %%mm2        \n\t"
2033
                        "punpcklbw %%mm7, %%mm0                \n\t"
2034
                        "punpcklbw %%mm7, %%mm2                \n\t"
2035
                        "pmaddwd %%mm1, %%mm0                \n\t"
2036
                        "pmaddwd %%mm2, %%mm3                \n\t"
2037
                        "paddd %%mm3, %%mm5                \n\t"
2038
                        "paddd %%mm0, %%mm4                \n\t"
2039
                        "addl $8, %1                        \n\t"
2040
                        "addl $4, %%ecx                        \n\t"
2041
                        "cmpl %4, %%ecx                        \n\t"
2042
                        " jb 2b                                \n\t"
2043
                        "addl %6, %1                        \n\t"
2044
                        "psrad $8, %%mm4                \n\t"
2045
                        "psrad $8, %%mm5                \n\t"
2046
                        "packssdw %%mm5, %%mm4                \n\t"
2047
                        "pmaddwd %%mm6, %%mm4                \n\t"
2048
                        "packssdw %%mm4, %%mm4                \n\t"
2049
                        "movl %3, %%eax                        \n\t"
2050
                        "movd %%mm4, (%%eax, %0)        \n\t"
2051
                        "addl $4, %0                        \n\t"
2052
                        " jnc 1b                        \n\t"
2053

    
2054
                        : "+r" (counter), "+r" (filter)
2055
                        : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2056
                          "m" (src), "r" (filterSize*2)
2057
                        : "%ebx", "%eax", "%ecx"
2058
                );
2059
        }
2060
#else
        int i;
        for(i=0; i<dstW; i++)
        {
                int j;
                int srcPos= filterPos[i];
                int val=0;
//                printf("filterPos: %d\n", filterPos[i]);
                for(j=0; j<filterSize; j++)
                {
//                        printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
                }
//                filter += hFilterSize;
                dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//                dst[i] = val>>7;
        }
#endif
}
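
/* Illustrative reference (not compiled in): how a caller could fill filter /
 * filterPos for hScale() to get a plain 2-tap bilinear horizontal scale.
 * xInc is the usual 16.16 fixed-point source step; the two coefficients per
 * output pixel sum to roughly 1<<14, which matches the >>7 final scale of the
 * C fall-back above (and appears to be what the psrad $8 / pmaddwd w02
 * sequence in the MMX paths assumes).  The MMX paths expect filterSize 4, 8
 * or another multiple of 4, so filterSize==2 is only meant for the C
 * fall-back; names and the exact rounding are assumptions for illustration. */
#if 0
static void build_bilinear_filter_ref(int16_t *filter, int16_t *filterPos,
                                      int dstW, int srcW, int xInc /* 16.16 */)
{
        int i;
        for(i=0; i<dstW; i++)
        {
                int srcPos= (i*xInc)>>16;
                int frac  = (i*xInc)&0xFFFF;
                if(srcPos > srcW-2){ srcPos= srcW-2; frac= 0xFFFF; }
                filterPos[i] = srcPos;
                filter[2*i+0]= (0x10000 - frac)>>2;   /* weight of src[srcPos]   */
                filter[2*i+1]= frac>>2;               /* weight of src[srcPos+1] */
        }
}
#endif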
2079
      // *** horizontal scale Y line to temp buffer
2080
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2081
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2082
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
2083
                                   int srcFormat, uint8_t *formatConvBuffer)
2084
{
2085
    if(srcFormat==IMGFMT_YUY2)
2086
    {
2087
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2088
        src= formatConvBuffer;
2089
    }
2090
    else if(srcFormat==IMGFMT_BGR32)
2091
    {
2092
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2093
        src= formatConvBuffer;
2094
    }
2095
    else if(srcFormat==IMGFMT_BGR24)
2096
    {
2097
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2098
        src= formatConvBuffer;
2099
    }
2100
    else if(srcFormat==IMGFMT_BGR16)
2101
    {
2102
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2103
        src= formatConvBuffer;
2104
    }
2105
    else if(srcFormat==IMGFMT_BGR15)
2106
    {
2107
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2108
        src= formatConvBuffer;
2109
    }
2110
    else if(srcFormat==IMGFMT_RGB32)
2111
    {
2112
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2113
        src= formatConvBuffer;
2114
    }
2115
    else if(srcFormat==IMGFMT_RGB24)
2116
    {
2117
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2118
        src= formatConvBuffer;
2119
    }
2120

    
2121
#ifdef HAVE_MMX
2122
        // use the new MMX scaler if the MMX2 one cannot be used (it is faster than the x86 asm one)
2123
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2124
#else
2125
    if(!(flags&SWS_FAST_BILINEAR))
2126
#endif
2127
    {
2128
            RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2129
    }
2130
    else // Fast Bilinear upscale / crap downscale
2131
    {
2132
#ifdef ARCH_X86
2133
#ifdef HAVE_MMX2
2134
        int i;
2135
        if(canMMX2BeUsed)
2136
        {
2137
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
                        "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "movq %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t"
                        "paddw %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t"
                        "paddw %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFFFF
                        "movq %%mm2, %%mm4                \n\t"
                        "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "xorl %%eax, %%eax                \n\t" // i
                        "movl %0, %%esi                        \n\t" // src
                        "movl %1, %%edi                        \n\t" // buf1
                        "movl %3, %%edx                        \n\t" // (xInc*4)>>16
                        "xorl %%ecx, %%ecx                \n\t"
                        "xorl %%ebx, %%ebx                \n\t"
                        "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF

#define FUNNY_Y_CODE \
                        PREFETCH" 1024(%%esi)                \n\t"\
                        PREFETCH" 1056(%%esi)                \n\t"\
                        PREFETCH" 1088(%%esi)                \n\t"\
                        "call *%6                        \n\t"\
                        "movq %%mm4, %%mm2                \n\t"\
                        "xorl %%ecx, %%ecx                \n\t"

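/* the scaler code is invoked 8 times; each expansion prefetches ahead in the source
   line, calls it once, then restores the start fractions (mm4 -> mm2) and clears the
   fraction accumulator in %ecx */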
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                        :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                        "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
                );
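                /* the last few output pixels would read past the end of the source
                   line; recompute them from the last source pixel (*128 matches the
                   16-bit intermediate scale used above) */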
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
        // no MMX2, just normal x86 asm ...
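        /* 16.16 fixed-point DDA: the fractional x position is accumulated in %cx and
           the integer source index in %ebx (advanced by the carry of addw/adcl).
           Per output pixel this computes ((src[xx]<<16) + (src[xx+1]-src[xx])*frac)>>9,
           i.e. essentially the same bilinear mix as the C fall-back below; two output
           pixels are handled per loop iteration. */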
        asm volatile(
                "xorl %%eax, %%eax                \n\t" // i
                "xorl %%ebx, %%ebx                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                ".balign 16                        \n\t"
                "1:                                \n\t"
                "movzbl  (%0, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%edi, %%eax, 2)        \n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry

                "movzbl (%0, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 2(%%edi, %%eax, 2)        \n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry


                "addl $2, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"


                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
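        /* portable C fall-back: plain fixed-point bilinear interpolation, left-shifted
           by 7 to match the 16-bit intermediate format */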
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
                xpos+=xInc;
        }
#endif
    }
}

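// Same structure as hyscale(), but scales both chroma planes at once; the
// horizontally scaled V samples are stored 2048 int16_t entries after the U
// samples in the same destination buffer.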
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                   int srcFormat, uint8_t *formatConvBuffer)
{
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
            return;
    }

#ifdef HAVE_MMX
        // use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
            RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
            RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
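                /* same indirect-call scaler as in hyscale(): funnyUVCode is run 8 times
                   for the U line, then %%esi/%%edi are switched to src2 and dst+4096
                   bytes and it is run 8 more times for the V line */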
                asm volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
                "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
                "punpcklwd %%mm6, %%mm6                \n\t"
                "punpcklwd %%mm6, %%mm6                \n\t"
                "movq %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t"
                "paddw %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t"
                "paddw %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFFFF
                "movq %%mm2, %%mm4                \n\t"
                "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
                "punpcklwd %%mm6, %%mm6                \n\t"
                "punpcklwd %%mm6, %%mm6                \n\t"
                "xorl %%eax, %%eax                \n\t" // i
                "movl %0, %%esi                        \n\t" // src
                "movl %1, %%edi                        \n\t" // buf1
                "movl %3, %%edx                        \n\t" // (xInc*4)>>16
                "xorl %%ecx, %%ecx                \n\t"
                "xorl %%ebx, %%ebx                \n\t"
                "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF

#define FUNNYUVCODE \
                        PREFETCH" 1024(%%esi)                \n\t"\
                        PREFETCH" 1056(%%esi)                \n\t"\
                        PREFETCH" 1088(%%esi)                \n\t"\
                        "call *%7                        \n\t"\
                        "movq %%mm4, %%mm2        \n\t"\
                        "xorl %%ecx, %%ecx                \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
                "xorl %%eax, %%eax                \n\t" // i
                "movl %6, %%esi                        \n\t" // src
                "movl %1, %%edi                        \n\t" // buf1
                "addl $4096, %%edi                \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                  "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
        );
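                /* fix up the tail pixels of both chroma planes that would read past the
                   end of the source lines */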
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
                {
//                        printf("%d %d %d\n", dstWidth, i, srcW);
                        dst[i] = src1[srcW-1]*128;
                        dst[i+2048] = src2[srcW-1]*128;
                }
        }
        else
        {
#endif
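        /* non-MMX2 x86 path: the same 16.16 DDA as in hyscale(), interpolating one U
           and one V sample per iteration; the V result is written 4096 bytes
           (2048 int16_t entries) after the U result */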
        asm volatile(
                "xorl %%eax, %%eax                \n\t" // i
                "xorl %%ebx, %%ebx                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                ".balign 16                        \n\t"
                "1:                                \n\t"
                "movl %0, %%esi                        \n\t"
                "movzbl  (%%esi, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%%esi, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%edi, %%eax, 2)        \n\t"

                "movzbl  (%5, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%5, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"

                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>16 + carry
                "addl $1, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"

                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
                "r" (src2)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
                xpos+=xInc;
        }
#endif
   }
}

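/*
 * Main scaling entry point: the picture is processed slice by slice.
 * For every output line the required source lines are horizontally scaled into
 * the lumPixBuf/chrPixBuf ring buffers (unless they are already there) and then
 * vertically filtered / converted into the destination format. State that has
 * to survive until the next slice is stored back into the SwsContext at the end.
 */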
static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStride[]){

        /* load a few things into local vars to make the code more readable and faster */
        const int srcW= c->srcW;
        const int dstW= c->dstW;
        const int dstH= c->dstH;
        const int chrDstW= c->chrDstW;
        const int lumXInc= c->lumXInc;
        const int chrXInc= c->chrXInc;
        const int dstFormat= c->dstFormat;
        const int flags= c->flags;
        const int canMMX2BeUsed= c->canMMX2BeUsed;
        int16_t *vLumFilterPos= c->vLumFilterPos;
        int16_t *vChrFilterPos= c->vChrFilterPos;
        int16_t *hLumFilterPos= c->hLumFilterPos;
        int16_t *hChrFilterPos= c->hChrFilterPos;
        int16_t *vLumFilter= c->vLumFilter;
        int16_t *vChrFilter= c->vChrFilter;
        int16_t *hLumFilter= c->hLumFilter;
        int16_t *hChrFilter= c->hChrFilter;
        int16_t *lumMmxFilter= c->lumMmxFilter;
        int16_t *chrMmxFilter= c->chrMmxFilter;
        const int vLumFilterSize= c->vLumFilterSize;
        const int vChrFilterSize= c->vChrFilterSize;
        const int hLumFilterSize= c->hLumFilterSize;
        const int hChrFilterSize= c->hChrFilterSize;
        int16_t **lumPixBuf= c->lumPixBuf;
        int16_t **chrPixBuf= c->chrPixBuf;
        const int vLumBufSize= c->vLumBufSize;
        const int vChrBufSize= c->vChrBufSize;
        uint8_t *funnyYCode= c->funnyYCode;
        uint8_t *funnyUVCode= c->funnyUVCode;
        uint8_t *formatConvBuffer= c->formatConvBuffer;

        /* vars which will change and which we need to store back in the context */
        int dstY= c->dstY;
        int lumBufIndex= c->lumBufIndex;
        int chrBufIndex= c->chrBufIndex;
        int lastInLumBuf= c->lastInLumBuf;
        int lastInChrBuf= c->lastInChrBuf;
        int srcStride[3];
        uint8_t *src[3];
        uint8_t *dst[3];

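        /* set up plane pointers and strides: I420 and YV12 differ only in the order
           of the U and V planes, packed formats carry everything in plane 0, and
           gray input has no chroma planes at all */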
        if(c->srcFormat == IMGFMT_I420){
                src[0]= srcParam[0];
                src[1]= srcParam[2];
                src[2]= srcParam[1];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[2];
                srcStride[2]= srcStrideParam[1];
        }
        else if(c->srcFormat==IMGFMT_YV12){
                src[0]= srcParam[0];
                src[1]= srcParam[1];
                src[2]= srcParam[2];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[1];
                srcStride[2]= srcStrideParam[2];
        }
        else if(isPacked(c->srcFormat)){
                src[0]=
                src[1]=
                src[2]= srcParam[0];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]=
                srcStride[2]= srcStrideParam[0]<<1;
        }
        else if(isGray(c->srcFormat)){
                src[0]= srcParam[0];
                src[1]=
                src[2]= NULL;
                srcStride[0]= srcStrideParam[0];
                srcStride[1]=
                srcStride[2]= 0;
        }

        if(c->dstFormat == IMGFMT_I420){
                dst[0]= dstParam[0];
                dst[1]= dstParam[2];
                dst[2]= dstParam[1];

        }else{
                dst[0]= dstParam[0];
                dst[1]= dstParam[1];
                dst[2]= dstParam[2];
        }

//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//dstStride[0],dstStride[1],dstStride[2]);

        if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
        {
                static int firstTime=1; //FIXME move this into the context perhaps
                if(flags & SWS_PRINT_INFO && firstTime)
                {
                        fprintf(stderr, "SwScaler: Warning: dstStride is not aligned!\n"
                                        "SwScaler:          ->cannot do aligned memory accesses anymore\n");
                        firstTime=0;
                }
        }

        /* Note: the user might start scaling in the middle of the picture, so this will not
           get executed. That is not really intended, but it currently works, so people might do it. */
        if(srcSliceY ==0){
                lumBufIndex=0;
                chrBufIndex=0;
                dstY=0;
                lastInLumBuf= -1;
                lastInChrBuf= -1;
        }

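        /* main loop: one iteration per output line. Work out which source lines the
           vertical filters need, horizontally scale those that are not yet in the
           ring buffers, then run the vertical scaler / format converter. If the
           current slice does not contain enough source lines, only the buffers are
           filled and scaling continues with the next slice. */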
        for(;dstY < dstH; dstY++){
                unsigned char *dest =dst[0]+dstStride[0]*dstY;
                unsigned char *uDest=dst[1]+dstStride[1]*(dstY>>1);
                unsigned char *vDest=dst[2]+dstStride[2]*(dstY>>1);
                const int chrDstY= isHalfChrV(dstFormat) ? (dstY>>1) : dstY;

                const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
                const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
                const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
                const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

                //handle holes (FAST_BILINEAR & weird filters)
                if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
                if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
                ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

                // Do we have enough lines in this slice to output the dstY line?
                if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH)>>1))
                {
                        //Do horizontal scaling
                        while(lastInLumBuf < lastLumSrcY)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
//                                printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//                                printf("%d %d\n", lumBufIndex, vLumBufSize);
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf < lastChrSrcY)
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
                                //FIXME replace parameters through context struct (some at least)
                                RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                }
                else // not enough lines left in this slice -> load the rest in the buffer
                {
/*                printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
                        firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
                        lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
                        vChrBufSize, vLumBufSize);
*/
                        //Do horizontal scaling
                        while(lastInLumBuf+1 < srcSliceY + srcSliceH)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < (srcSliceH>>1))
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
                                RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                        break; //we can't output a dstY line, so let's try with the next slice
                }

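                /* per-line dither values (alternating with line parity), presumably
                   consumed by the 15/16 bpp RGB output code elsewhere in this file */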
#ifdef HAVE_MMX
                b5Dither= dither8[dstY&1];
                g6Dither= dither4[dstY&1];
                g5Dither= dither8[dstY&1];
                r5Dither= dither8[(dstY+1)&1];
#endif
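            /* for all but the last few output lines the (possibly MMX-accelerated)
               vertical scalers can be used; near the bottom of the picture they could
               touch memory past the end of the buffers, so the plain C versions in
               the else branch below are used instead */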
            if(dstY < dstH-2)
            {
                if(isPlanarYUV(dstFormat)) //YV12 like
                {
                        if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                        {
                                int16_t *lumBuf = lumPixBuf[0];
                                int16_t *chrBuf= chrPixBuf[0];
                                RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
                        }
                        else //General YV12
                        {
                                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                                RENAME(yuv2yuvX)(
                                        vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
                                        vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, uDest, vDest, dstW,
                                        lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
                        }
                }
                else
                {
                        int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                        int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;

                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                        {
                                int chrAlpha= vChrFilter[2*dstY+1];

                                RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, chrAlpha, dstFormat, flags);
                        }
                        else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                        {
                                int lumAlpha= vLumFilter[2*dstY+1];
                                int chrAlpha= vChrFilter[2*dstY+1];

                                RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, lumAlpha, chrAlpha, dstFormat, flags);
                        }
                        else //General RGB
                        {
                                RENAME(yuv2rgbX)(
                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, dstW, dstFormat,
                                        lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
                        }
                }
            }
            else // hmm, looks like we can't use MMX here without overwriting this array's tail
            {
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                if(isPlanarYUV(dstFormat)) //YV12
                {
                        if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
                                vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, uDest, vDest, dstW);
                }
                else
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        yuv2rgbXinC(
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, dstW, dstFormat);
                }
            }
        }

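        /* flush the non-temporal MOVNTQ stores and clear the MMX state */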
#ifdef HAVE_MMX
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* store changed local vars back in the context */
        c->dstY= dstY;
        c->lumBufIndex= lumBufIndex;
        c->chrBufIndex= chrBufIndex;
        c->lastInLumBuf= lastInLumBuf;
        c->lastInChrBuf= lastInChrBuf;
}