Statistics
| Branch: | Revision:

ffmpeg / postproc / swscale_template.c @ e616aa93

History | View | Annotate | Download (85.2 KB)

1
/*
2
    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
*/
18

    
19
/*
 * Drop any earlier definitions of the CPU-dependent helper macros so that the
 * #define lines further down can set them afresh without redefinition
 * conflicts.
 * NOTE(review): presumably needed because this template is compiled more than
 * once with different HAVE_* settings -- confirm against the including file.
 */
#undef MOVNTQ
20
#undef PAVGB
21
#undef PREFETCH
22
#undef PREFETCHW
23
#undef EMMS
24
#undef SFENCE
25

    
26
/* EMMS: mnemonic (as a string) used to leave MMX state; "femms" when 3DNow! is available, "emms" otherwise. */
#ifdef HAVE_3DNOW
27
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
28
#define EMMS     "femms"
29
#else
30
#define EMMS     "emms"
31
#endif
32

    
33
/*
 * PREFETCH / PREFETCHW: mnemonic strings for a read prefetch and a
 * prefetch-for-write hint.  They are meant to be followed by the operand text
 * at the use site, e.g. PREFETCH" 32(%%esi) \n\t".
 *
 *   3DNow!  : prefetch    / prefetchw
 *   MMX2    : prefetchnta / prefetcht0
 *   neither : the macros expand to an assembler comment, so the operand text
 *             that follows is swallowed and no instruction is emitted.
 *
 * The fallback uses '#' rather than '/' to start the comment: '#' is a
 * comment character on every i386 GNU as target, whereas '/' is only one on
 * some targets and stops being one when --divide is given.
 */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
43

    
44
/*
 * SFENCE: mnemonic string for a store fence.  With MMX2 it is the real
 * "sfence" instruction; without it the macro expands to an assembler comment
 * so that nothing is emitted.
 *
 * The fallback uses '#' rather than '/' to start the comment: '#' is a
 * comment character on every i386 GNU as target, whereas '/' is only one on
 * some targets and stops being one when --divide is given.
 */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif
49

    
50
/*
 * PAVGB(a,b): emits one "average packed bytes" instruction line with operands
 * a, b (stringified as written): pavgb with MMX2, pavgusb with 3DNow!.
 * The macro is left undefined when neither extension is available.
 */
#ifdef HAVE_MMX2
51
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
52
#elif defined (HAVE_3DNOW)
53
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
54
#endif
55

    
56
/*
 * MOVNTQ(a,b): emits one 64-bit move line from a to b (operands stringified
 * as written).  With MMX2 the non-temporal store movntq is used; otherwise a
 * plain movq.
 */
#ifdef HAVE_MMX2
57
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
58
#else
59
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
60
#endif
61

    
62

    
63
/*
 * YSCALEYUV2YV12X(x): asm text of a multi-tap vertical filter that produces
 * 8 output bytes per outer iteration.
 *
 * Operands, as used by the instructions below (the constraint list lives at
 * the use site and is not visible here):
 *   %0  tap counter: loaded into edx and incremented by 1 until it reaches 0
 *   %1  base of a table of 32-bit source pointers, indexed by edx
 *   %2  base of the 8-byte filter coefficients, indexed by edx
 *   %3  destination, written at byte offset eax
 *   %4  bound for eax; the outer loop repeats while eax < %4 (unsigned)
 * x is a byte displacement added to every source address.
 *
 * For each tap two groups of four 16-bit samples are multiplied by the
 * coefficient (pmulhw keeps the high word) and accumulated in mm3/mm4; the
 * sums are shifted right by 3, packed to bytes with unsigned saturation and
 * stored through MOVNTQ.
 * Uses eax, edx, esi and mm0, mm2, mm3, mm4, mm5.
 */
#define YSCALEYUV2YV12X(x) \
64
                        "xorl %%eax, %%eax                \n\t"\
65
                        "pxor %%mm3, %%mm3                \n\t"\
66
                        "pxor %%mm4, %%mm4                \n\t"\
67
                        "movl %0, %%edx                        \n\t"\
68
                        ".balign 16                        \n\t" /* FIXME Unroll? */\
69
                        "1:                                \n\t"\
70
                        "movl (%1, %%edx, 4), %%esi        \n\t"\
71
                        "movq (%2, %%edx, 8), %%mm0        \n\t" /* filterCoeff */\
72
                        "movq " #x "(%%esi, %%eax, 2), %%mm2        \n\t" /* srcData, samples 0..3 */\
73
                        "movq 8+" #x "(%%esi, %%eax, 2), %%mm5        \n\t" /* srcData, samples 4..7 */\
74
                        "pmulhw %%mm0, %%mm2                \n\t"\
75
                        "pmulhw %%mm0, %%mm5                \n\t"\
76
                        "paddw %%mm2, %%mm3                \n\t"\
77
                        "paddw %%mm5, %%mm4                \n\t"\
78
                        "addl $1, %%edx                        \n\t"\
79
                        " jnz 1b                        \n\t"\
80
                        "psraw $3, %%mm3                \n\t"\
81
                        "psraw $3, %%mm4                \n\t"\
82
                        "packuswb %%mm4, %%mm3                \n\t"\
83
                        MOVNTQ(%%mm3, (%3, %%eax))\
84
                        "addl $8, %%eax                        \n\t"\
85
                        "cmpl %4, %%eax                        \n\t"\
86
                        "pxor %%mm3, %%mm3                \n\t"\
87
                        "pxor %%mm4, %%mm4                \n\t"\
88
                        "movl %0, %%edx                        \n\t"\
89
                        "jb 1b                                \n\t"
90

    
91
/*
 * YSCALEYUV2YV121: asm text that narrows 16-bit samples to bytes without any
 * filtering.
 *
 * Each iteration reads 16 bytes at %0 + 2*eax, shifts every word right by 7
 * (arithmetic), packs the eight words to bytes with unsigned saturation and
 * stores them at %1 + eax through MOVNTQ.
 * eax starts at %2 and is advanced by 8; the loop repeats as long as that
 * addition produces no carry.
 * NOTE(review): this presumably means %2 is a negative count and %0/%1 point
 * at the end of their buffers -- confirm at the use site.
 * Uses eax, mm0, mm1.
 */
#define YSCALEYUV2YV121 \
92
                        "movl %2, %%eax                        \n\t"\
93
                        ".balign 16                        \n\t" /* FIXME Unroll? */\
94
                        "1:                                \n\t"\
95
                        "movq (%0, %%eax, 2), %%mm0        \n\t"\
96
                        "movq 8(%0, %%eax, 2), %%mm1        \n\t"\
97
                        "psraw $7, %%mm0                \n\t"\
98
                        "psraw $7, %%mm1                \n\t"\
99
                        "packuswb %%mm1, %%mm0                \n\t"\
100
                        MOVNTQ(%%mm0, (%1, %%eax))\
101
                        "addl $8, %%eax                        \n\t"\
102
                        "jnc 1b                                \n\t"
103

    
104
/*
 * YSCALEYUV2RGBX: asm text that applies a multi-tap vertical filter to chroma
 * and luma and then converts the result to packed B/G/R bytes.  It opens the
 * loop at label 1 but does not close it; the macro's last instruction leaves
 * mm7 zeroed.
 *
 * Operand list expected at the use site:
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
 *
 * The two inner loops (both labelled 2) count edx from the negated filter
 * size up to 0.  On exit from the macro: mm2=B, mm4=G, mm5=R (bytes), mm7=0.
*/
111
#define YSCALEYUV2RGBX \
112
                "xorl %%eax, %%eax                \n\t"\
113
                ".balign 16                        \n\t"\
114
                "1:                                \n\t"\
115
                "movl %1, %%edx                        \n\t" /* -chrFilterSize */\
116
                "movl %3, %%ebx                        \n\t" /* chrMmxFilter+chrFilterSize*4 */\
117
                "movl %7, %%ecx                        \n\t" /* chrSrc+chrFilterSize */\
118
                "pxor %%mm3, %%mm3                \n\t"\
119
                "pxor %%mm4, %%mm4                \n\t"\
120
                "2:                                \n\t"\
121
                "movl (%%ecx, %%edx, 4), %%esi        \n\t"\
122
                "movq (%%ebx, %%edx, 8), %%mm0        \n\t" /* filterCoeff */\
123
                "movq (%%esi, %%eax), %%mm2        \n\t" /* UsrcData */\
124
                "movq 4096(%%esi, %%eax), %%mm5        \n\t" /* VsrcData */\
125
                "pmulhw %%mm0, %%mm2                \n\t"\
126
                "pmulhw %%mm0, %%mm5                \n\t"\
127
                "paddw %%mm2, %%mm3                \n\t"\
128
                "paddw %%mm5, %%mm4                \n\t"\
129
                "addl $1, %%edx                        \n\t"\
130
                " jnz 2b                        \n\t"\
131
\
132
                "movl %0, %%edx                        \n\t" /* -lumFilterSize */\
133
                "movl %2, %%ebx                        \n\t" /* lumMmxFilter+lumFilterSize*4 */\
134
                "movl %6, %%ecx                        \n\t" /* lumSrc+lumFilterSize */\
135
                "pxor %%mm1, %%mm1                \n\t"\
136
                "pxor %%mm7, %%mm7                \n\t"\
137
                "2:                                \n\t"\
138
                "movl (%%ecx, %%edx, 4), %%esi        \n\t"\
139
                "movq (%%ebx, %%edx, 8), %%mm0        \n\t" /* filterCoeff */\
140
                "movq (%%esi, %%eax, 2), %%mm2        \n\t" /* Y1srcData */\
141
                "movq 8(%%esi, %%eax, 2), %%mm5        \n\t" /* Y2srcData */\
142
                "pmulhw %%mm0, %%mm2                \n\t"\
143
                "pmulhw %%mm0, %%mm5                \n\t"\
144
                "paddw %%mm2, %%mm1                \n\t"\
145
                "paddw %%mm5, %%mm7                \n\t"\
146
                "addl $1, %%edx                        \n\t"\
147
                " jnz 2b                        \n\t"\
148
\
149
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* (U-128)8*/\
150
                "psubw "MANGLE(w400)", %%mm4        \n\t" /* (V-128)8*/\
151
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
152
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
153
                "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
154
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
155
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
156
                "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
157
                "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
158
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
159
                "psubw "MANGLE(w80)", %%mm7        \n\t" /* 8(Y-16)*/\
160
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
161
                "pmulhw "MANGLE(yCoeff)", %%mm7        \n\t"\
162
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
163
                "paddw %%mm3, %%mm4                \n\t"\
164
                "movq %%mm2, %%mm0                \n\t"\
165
                "movq %%mm5, %%mm6                \n\t"\
166
                "movq %%mm4, %%mm3                \n\t"\
167
                "punpcklwd %%mm2, %%mm2                \n\t"\
168
                "punpcklwd %%mm5, %%mm5                \n\t"\
169
                "punpcklwd %%mm4, %%mm4                \n\t"\
170
                "paddw %%mm1, %%mm2                \n\t"\
171
                "paddw %%mm1, %%mm5                \n\t"\
172
                "paddw %%mm1, %%mm4                \n\t"\
173
                "punpckhwd %%mm0, %%mm0                \n\t"\
174
                "punpckhwd %%mm6, %%mm6                \n\t"\
175
                "punpckhwd %%mm3, %%mm3                \n\t"\
176
                "paddw %%mm7, %%mm0                \n\t"\
177
                "paddw %%mm7, %%mm6                \n\t"\
178
                "paddw %%mm7, %%mm3                \n\t"\
179
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
180
                "packuswb %%mm0, %%mm2                \n\t"\
181
                "packuswb %%mm6, %%mm5                \n\t"\
182
                "packuswb %%mm3, %%mm4                \n\t"\
183
                "pxor %%mm7, %%mm7                \n\t"
184

    
185
/*
 * FULL_YSCALEYUV2RGB: asm text that blends two luma lines and two chroma
 * lines with 16-bit weights and converts four samples per iteration to
 * B/G/R bytes.  It opens the loop at label 1 but does not close it or store
 * anything; the code that follows the macro at the use site must do both.
 *
 * Operands, as named by the inline comments:
 *   %0 buf0, %1 buf1          luma lines, 16-bit samples
 *   %2 uvbuf0, %3 uvbuf1      chroma lines; the second chroma plane is read
 *                             4096 bytes (2048 samples) further on
 *   %6 yalpha1, %7 uvalpha1   blend weights, broadcast into mm6 / mm5
 *
 * On exit from the macro text: mm3=B, mm1=G, mm0=R (packed bytes), mm7=0.
 */
#define FULL_YSCALEYUV2RGB \
186
                "pxor %%mm7, %%mm7                \n\t"\
187
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
188
                "punpcklwd %%mm6, %%mm6                \n\t"\
189
                "punpcklwd %%mm6, %%mm6                \n\t"\
190
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
191
                "punpcklwd %%mm5, %%mm5                \n\t"\
192
                "punpcklwd %%mm5, %%mm5                \n\t"\
193
                "xorl %%eax, %%eax                \n\t"\
194
                ".balign 16                        \n\t"\
195
                "1:                                \n\t"\
196
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
197
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
198
                "movq (%2, %%eax,2), %%mm2        \n\t" /* uvbuf0[eax]*/\
199
                "movq (%3, %%eax,2), %%mm3        \n\t" /* uvbuf1[eax]*/\
200
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
201
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
202
                "pmulhw %%mm6, %%mm0                \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
203
                "pmulhw %%mm5, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
204
                "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
205
                "movq 4096(%2, %%eax,2), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
206
                "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
207
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
208
                "movq 4096(%3, %%eax,2), %%mm0        \n\t" /* uvbuf1[eax+2048]*/\
209
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
210
                "psubw %%mm0, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
211
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
212
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* 8(U-128)*/\
213
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
214
\
215
\
216
                "pmulhw %%mm5, %%mm4                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
217
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
218
                "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
219
                "psraw $4, %%mm0                \n\t" /* uvbuf1[eax+2048] >>4*/\
220
                "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
221
                "paddw %%mm4, %%mm0                \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
222
                "psubw "MANGLE(w400)", %%mm0        \n\t" /* (V-128)8*/\
223
\
224
\
225
                "movq %%mm0, %%mm4                \n\t" /* (V-128)8*/\
226
                "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
227
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
228
                "paddw %%mm1, %%mm3                \n\t" /* B*/\
229
                "paddw %%mm1, %%mm0                \n\t" /* R*/\
230
                "packuswb %%mm3, %%mm3                \n\t"\
231
\
232
                "packuswb %%mm0, %%mm0                \n\t"\
233
                "paddw %%mm4, %%mm2                \n\t"\
234
                "paddw %%mm2, %%mm1                \n\t" /* G*/\
235
\
236
                "packuswb %%mm1, %%mm1                \n\t"
237

    
238
/*
 * YSCALEYUV2RGB: asm text that blends two luma lines and two chroma lines
 * with 16-bit weights and converts eight luma samples per iteration to
 * B/G/R bytes.  It opens the loop at label 1 but does not close it or store
 * anything; a WRITE* macro is expected to follow at the use site.
 *
 * Operands, as named by the inline comments:
 *   %0 buf0, %1 buf1          luma lines, addressed at byte offset 2*eax
 *   %2 uvbuf0, %3 uvbuf1      chroma lines, addressed at byte offset eax;
 *                             the second chroma plane is read 4096 bytes on
 *   %6 yalpha1, %7 uvalpha1   blend weights
 * The broadcast weights are spilled to asm_yalpha1 / asm_uvalpha1 in memory
 * before the loop and re-read inside it.
 *
 * On exit from the macro text: mm2=B, mm4=G, mm5=R (packed bytes), mm7=0.
 */
#define YSCALEYUV2RGB \
239
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
240
                "punpcklwd %%mm6, %%mm6                \n\t"\
241
                "punpcklwd %%mm6, %%mm6                \n\t"\
242
                "movq %%mm6, "MANGLE(asm_yalpha1)"\n\t"\
243
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
244
                "punpcklwd %%mm5, %%mm5                \n\t"\
245
                "punpcklwd %%mm5, %%mm5                \n\t"\
246
                "movq %%mm5, "MANGLE(asm_uvalpha1)"\n\t"\
247
                "xorl %%eax, %%eax                \n\t"\
248
                ".balign 16                        \n\t"\
249
                "1:                                \n\t"\
250
                "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
251
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
252
                "movq 4096(%2, %%eax), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
253
                "movq 4096(%3, %%eax), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
254
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
255
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
256
                "movq "MANGLE(asm_uvalpha1)", %%mm0\n\t"\
257
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
258
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
259
                "psraw $4, %%mm3                \n\t" /* uvbuf1[eax] >>4*/\
260
                "psraw $4, %%mm4                \n\t" /* uvbuf1[eax+2048] >>4*/\
261
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
262
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
263
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* (U-128)8*/\
264
                "psubw "MANGLE(w400)", %%mm4        \n\t" /* (V-128)8*/\
265
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
266
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
267
                "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
268
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
269
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
270
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
271
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
272
                "movq 8(%0, %%eax, 2), %%mm6        \n\t" /*buf0[eax+4]*/\
273
                "movq 8(%1, %%eax, 2), %%mm7        \n\t" /*buf1[eax+4]*/\
274
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
275
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
276
                "pmulhw "MANGLE(asm_yalpha1)", %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
277
                "pmulhw "MANGLE(asm_yalpha1)", %%mm6\n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
278
                "psraw $4, %%mm1                \n\t" /* buf1[eax] >>4*/\
279
                "psraw $4, %%mm7                \n\t" /* buf1[eax+4] >>4*/\
280
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
281
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax+4]yalpha1 + buf1[eax+4](1-yalpha1) >>16*/\
282
                "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
283
                "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
284
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
285
                "psubw "MANGLE(w80)", %%mm7        \n\t" /* 8(Y-16)*/\
286
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
287
                "pmulhw "MANGLE(yCoeff)", %%mm7        \n\t"\
288
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
289
                "paddw %%mm3, %%mm4                \n\t"\
290
                "movq %%mm2, %%mm0                \n\t"\
291
                "movq %%mm5, %%mm6                \n\t"\
292
                "movq %%mm4, %%mm3                \n\t"\
293
                "punpcklwd %%mm2, %%mm2                \n\t"\
294
                "punpcklwd %%mm5, %%mm5                \n\t"\
295
                "punpcklwd %%mm4, %%mm4                \n\t"\
296
                "paddw %%mm1, %%mm2                \n\t"\
297
                "paddw %%mm1, %%mm5                \n\t"\
298
                "paddw %%mm1, %%mm4                \n\t"\
299
                "punpckhwd %%mm0, %%mm0                \n\t"\
300
                "punpckhwd %%mm6, %%mm6                \n\t"\
301
                "punpckhwd %%mm3, %%mm3                \n\t"\
302
                "paddw %%mm7, %%mm0                \n\t"\
303
                "paddw %%mm7, %%mm6                \n\t"\
304
                "paddw %%mm7, %%mm3                \n\t"\
305
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
306
                "packuswb %%mm0, %%mm2                \n\t"\
307
                "packuswb %%mm6, %%mm5                \n\t"\
308
                "packuswb %%mm3, %%mm4                \n\t"\
309
                "pxor %%mm7, %%mm7                \n\t"
310

    
311
/*
 * YSCALEYUV2RGB1: single-line variant of the YUV->RGB asm text.  Only buf0
 * (%0) and uvbuf0 (%2) are read; no blending with a second line is done.
 * Eight luma samples are converted per iteration.  The macro opens the loop
 * at label 1 but does not close it or store anything; a WRITE* macro is
 * expected to follow at the use site.
 *
 * Luma is addressed at byte offset 2*eax, chroma at byte offset eax, with
 * the second chroma plane 4096 bytes further on.
 * On exit from the macro text: mm2=B, mm4=G, mm5=R (packed bytes), mm7=0.
 */
#define YSCALEYUV2RGB1 \
312
                "xorl %%eax, %%eax                \n\t"\
313
                ".balign 16                        \n\t"\
314
                "1:                                \n\t"\
315
                "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
316
                "movq 4096(%2, %%eax), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
317
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] >>4*/\
318
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] >>4*/\
319
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* (U-128)8*/\
320
                "psubw "MANGLE(w400)", %%mm4        \n\t" /* (V-128)8*/\
321
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
322
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
323
                "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
324
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
325
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
326
                "movq (%0, %%eax, 2), %%mm1        \n\t" /*buf0[eax]*/\
327
                "movq 8(%0, %%eax, 2), %%mm7        \n\t" /*buf0[eax+4]*/\
328
                "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
329
                "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
330
                "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
331
                "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
332
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
333
                "psubw "MANGLE(w80)", %%mm7        \n\t" /* 8(Y-16)*/\
334
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
335
                "pmulhw "MANGLE(yCoeff)", %%mm7        \n\t"\
336
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
337
                "paddw %%mm3, %%mm4                \n\t"\
338
                "movq %%mm2, %%mm0                \n\t"\
339
                "movq %%mm5, %%mm6                \n\t"\
340
                "movq %%mm4, %%mm3                \n\t"\
341
                "punpcklwd %%mm2, %%mm2                \n\t"\
342
                "punpcklwd %%mm5, %%mm5                \n\t"\
343
                "punpcklwd %%mm4, %%mm4                \n\t"\
344
                "paddw %%mm1, %%mm2                \n\t"\
345
                "paddw %%mm1, %%mm5                \n\t"\
346
                "paddw %%mm1, %%mm4                \n\t"\
347
                "punpckhwd %%mm0, %%mm0                \n\t"\
348
                "punpckhwd %%mm6, %%mm6                \n\t"\
349
                "punpckhwd %%mm3, %%mm3                \n\t"\
350
                "paddw %%mm7, %%mm0                \n\t"\
351
                "paddw %%mm7, %%mm6                \n\t"\
352
                "paddw %%mm7, %%mm3                \n\t"\
353
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
354
                "packuswb %%mm0, %%mm2                \n\t"\
355
                "packuswb %%mm6, %%mm5                \n\t"\
356
                "packuswb %%mm3, %%mm4                \n\t"\
357
                "pxor %%mm7, %%mm7                \n\t"
358

    
359
// do vertical chrominance interpolation
/*
 * YSCALEYUV2RGB1b: like YSCALEYUV2RGB1 for luma (only buf0, %0, is read) but
 * chroma is the sum of the two chroma lines uvbuf0 (%2) and uvbuf1 (%3),
 * shifted right by 5 with a logical shift.  The 16-bit sum is what the FIXME
 * comments warn may overflow.
 * The macro opens the loop at label 1 but does not close it or store
 * anything; a WRITE* macro is expected to follow at the use site.
 * On exit from the macro text: mm2=B, mm4=G, mm5=R (packed bytes), mm7=0.
 */
360
#define YSCALEYUV2RGB1b \
361
                "xorl %%eax, %%eax                \n\t"\
362
                ".balign 16                        \n\t"\
363
                "1:                                \n\t"\
364
                "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
365
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
366
                "movq 4096(%2, %%eax), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
367
                "movq 4096(%3, %%eax), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
368
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
369
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
370
                "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
371
                "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
372
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* (U-128)8*/\
373
                "psubw "MANGLE(w400)", %%mm4        \n\t" /* (V-128)8*/\
374
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
375
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
376
                "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
377
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
378
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
379
                "movq (%0, %%eax, 2), %%mm1        \n\t" /*buf0[eax]*/\
380
                "movq 8(%0, %%eax, 2), %%mm7        \n\t" /*buf0[eax+4]*/\
381
                "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
382
                "psraw $4, %%mm7                \n\t" /* buf0[eax+4] >>4*/\
383
                "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
384
                "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
385
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
386
                "psubw "MANGLE(w80)", %%mm7        \n\t" /* 8(Y-16)*/\
387
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
388
                "pmulhw "MANGLE(yCoeff)", %%mm7        \n\t"\
389
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
390
                "paddw %%mm3, %%mm4                \n\t"\
391
                "movq %%mm2, %%mm0                \n\t"\
392
                "movq %%mm5, %%mm6                \n\t"\
393
                "movq %%mm4, %%mm3                \n\t"\
394
                "punpcklwd %%mm2, %%mm2                \n\t"\
395
                "punpcklwd %%mm5, %%mm5                \n\t"\
396
                "punpcklwd %%mm4, %%mm4                \n\t"\
397
                "paddw %%mm1, %%mm2                \n\t"\
398
                "paddw %%mm1, %%mm5                \n\t"\
399
                "paddw %%mm1, %%mm4                \n\t"\
400
                "punpckhwd %%mm0, %%mm0                \n\t"\
401
                "punpckhwd %%mm6, %%mm6                \n\t"\
402
                "punpckhwd %%mm3, %%mm3                \n\t"\
403
                "paddw %%mm7, %%mm0                \n\t"\
404
                "paddw %%mm7, %%mm6                \n\t"\
405
                "paddw %%mm7, %%mm3                \n\t"\
406
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
407
                "packuswb %%mm0, %%mm2                \n\t"\
408
                "packuswb %%mm6, %%mm5                \n\t"\
409
                "packuswb %%mm3, %%mm4                \n\t"\
410
                "pxor %%mm7, %%mm7                \n\t"
411

    
412
/*
 * WRITEBGR32: asm text that interleaves eight B, G and R bytes with a zero
 * byte into eight 4-byte pixels and stores them, then closes the loop that a
 * preceding macro opened at label 1.
 *
 * Inputs: mm2=B, mm4=G, mm5=R (packed bytes), mm7=0.
 * Operands: %4 is the destination base (32 bytes are written at %4 + 4*eax
 * through MOVNTQ), %5 is the bound for eax.  eax advances by 8 and the loop
 * repeats while eax < %5 (unsigned).
 * Overwrites mm0-mm3, mm5, mm6.
 */
#define WRITEBGR32 \
413
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
414
                        "movq %%mm2, %%mm1                \n\t" /* B */\
415
                        "movq %%mm5, %%mm6                \n\t" /* R */\
416
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
417
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
418
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
419
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
420
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
421
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
422
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
423
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
424
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
425
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
426
\
427
                        MOVNTQ(%%mm0, (%4, %%eax, 4))\
428
                        MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
429
                        MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
430
                        MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
431
\
432
                        "addl $8, %%eax                        \n\t"\
433
                        "cmpl %5, %%eax                        \n\t"\
434
                        " jb 1b                                \n\t"
435

    
436
/*
 * WRITEBGR16: asm text that packs eight B, G and R bytes into eight 16-bit
 * pixels and stores them, then closes the loop that a preceding macro opened
 * at label 1.
 *
 * Inputs: mm2=B, mm4=G, mm5=R (packed bytes), mm7=0.
 * B and R are masked with bF8 and G with bFC; B is shifted right by 3, R is
 * interleaved into the high byte of each word, and G is widened to words,
 * shifted left by 3 and OR-ed in.
 * NOTE(review): bF8/bFC are presumably per-byte 0xF8/0xFC masks, which would
 * make this a 5-6-5 layout -- confirm against their definitions.
 * Operands: %4 is the destination base (16 bytes are written at %4 + 2*eax
 * through MOVNTQ), %5 is the bound for eax.  eax advances by 8 and the loop
 * repeats while eax < %5 (unsigned).
 */
#define WRITEBGR16 \
437
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
438
                        "pand "MANGLE(bFC)", %%mm4        \n\t" /* G */\
439
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
440
                        "psrlq $3, %%mm2                \n\t"\
441
\
442
                        "movq %%mm2, %%mm1                \n\t"\
443
                        "movq %%mm4, %%mm3                \n\t"\
444
\
445
                        "punpcklbw %%mm7, %%mm3                \n\t"\
446
                        "punpcklbw %%mm5, %%mm2                \n\t"\
447
                        "punpckhbw %%mm7, %%mm4                \n\t"\
448
                        "punpckhbw %%mm5, %%mm1                \n\t"\
449
\
450
                        "psllq $3, %%mm3                \n\t"\
451
                        "psllq $3, %%mm4                \n\t"\
452
\
453
                        "por %%mm3, %%mm2                \n\t"\
454
                        "por %%mm4, %%mm1                \n\t"\
455
\
456
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
457
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
458
\
459
                        "addl $8, %%eax                        \n\t"\
460
                        "cmpl %5, %%eax                        \n\t"\
461
                        " jb 1b                                \n\t"
462

    
463
/*
 * WRITEBGR15: asm text that packs eight B, G and R bytes into eight 16-bit
 * pixels and stores them, then closes the loop that a preceding macro opened
 * at label 1.
 *
 * Inputs: mm2=B, mm4=G, mm5=R (packed bytes), mm7=0.
 * All three channels are masked with bF8; B is shifted right by 3 and R by 1,
 * R is interleaved into the high byte of each word, and G is widened to
 * words, shifted left by 2 and OR-ed in.
 * NOTE(review): bF8 is presumably a per-byte 0xF8 mask, which would make this
 * a 5-5-5 layout -- confirm against its definition.
 * Operands: %4 is the destination base (16 bytes are written at %4 + 2*eax
 * through MOVNTQ), %5 is the bound for eax.  eax advances by 8 and the loop
 * repeats while eax < %5 (unsigned).
 */
#define WRITEBGR15 \
464
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
465
                        "pand "MANGLE(bF8)", %%mm4        \n\t" /* G */\
466
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
467
                        "psrlq $3, %%mm2                \n\t"\
468
                        "psrlq $1, %%mm5                \n\t"\
469
\
470
                        "movq %%mm2, %%mm1                \n\t"\
471
                        "movq %%mm4, %%mm3                \n\t"\
472
\
473
                        "punpcklbw %%mm7, %%mm3                \n\t"\
474
                        "punpcklbw %%mm5, %%mm2                \n\t"\
475
                        "punpckhbw %%mm7, %%mm4                \n\t"\
476
                        "punpckhbw %%mm5, %%mm1                \n\t"\
477
\
478
                        "psllq $2, %%mm3                \n\t"\
479
                        "psllq $2, %%mm4                \n\t"\
480
\
481
                        "por %%mm3, %%mm2                \n\t"\
482
                        "por %%mm4, %%mm1                \n\t"\
483
\
484
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
485
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
486
\
487
                        "addl $8, %%eax                        \n\t"\
488
                        "cmpl %5, %%eax                        \n\t"\
489
                        " jb 1b                                \n\t"
490

    
491
/*
 * WRITEBGR24OLD: asm fragment that packs 8 pixels into 24 bytes (3 bytes per
 * pixel) and stores them.
 *
 * Input registers: mm2=B, mm4=G, mm5=R, mm7=0.
 * The channels are first interleaved into four registers holding two
 * 4-byte "0RGB" pixels each; the padding byte of every pixel is then squeezed
 * out with the byte masks bm00000111 / bm11111000 / bm00001111 plus shifts.
 * The three resulting quadwords are written to (ebx), 8(ebx) and 16(ebx);
 * ebx is advanced by 24, eax by 8, and the code jumps back to local label 1
 * while eax < %5. Label 1 has to be provided by the code placed in front of
 * this macro inside the same asm statement.
 * Overwrites mm0-mm6. The WRITEBGR24 selection further down in this file does
 * not pick this variant.
 */
#define WRITEBGR24OLD \
492
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
493
                        "movq %%mm2, %%mm1                \n\t" /* B */\
494
                        "movq %%mm5, %%mm6                \n\t" /* R */\
495
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
496
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
497
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
498
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
499
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
500
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
501
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
502
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
503
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
504
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
505
\
506
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
507
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
508
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
509
                        "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
510
                        "por %%mm4, %%mm0                \n\t" /* 00RGBRGB 0 */\
511
                        "movq %%mm2, %%mm4                \n\t" /* 0RGB0RGB 1 */\
512
                        "psllq $48, %%mm2                \n\t" /* GB000000 1 */\
513
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
514
\
515
                        "movq %%mm4, %%mm2                \n\t" /* 0RGB0RGB 1 */\
516
                        "psrld $16, %%mm4                \n\t" /* 000R000R 1 */\
517
                        "psrlq $24, %%mm2                \n\t" /* 0000RGB0 1.5 */\
518
                        "por %%mm4, %%mm2                \n\t" /* 000RRGBR 1 */\
519
                        "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
520
                        "movq %%mm1, %%mm4                \n\t" /* 0RGB0RGB 2 */\
521
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
522
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
523
                        "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
524
                        "por %%mm4, %%mm1                \n\t" /* 00RGBRGB 2 */\
525
                        "movq %%mm1, %%mm4                \n\t" /* 00RGBRGB 2 */\
526
                        "psllq $32, %%mm1                \n\t" /* BRGB0000 2 */\
527
                        "por %%mm1, %%mm2                \n\t" /* BRGBRGBR 1 */\
528
\
529
                        "psrlq $32, %%mm4                \n\t" /* 000000RG 2.5 */\
530
                        "movq %%mm3, %%mm5                \n\t" /* 0RGB0RGB 3 */\
531
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
532
                        "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
533
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
534
                        "por %%mm5, %%mm3                \n\t" /* 00RGBRGB 3 */\
535
                        "psllq $16, %%mm3                \n\t" /* RGBRGB00 3 */\
536
                        "por %%mm4, %%mm3                \n\t" /* RGBRGBRG 2.5 */\
537
\
538
                        MOVNTQ(%%mm0, (%%ebx))\
539
                        MOVNTQ(%%mm2, 8(%%ebx))\
540
                        MOVNTQ(%%mm3, 16(%%ebx))\
541
                        "addl $24, %%ebx                \n\t"\
542
\
543
                        "addl $8, %%eax                        \n\t"\
544
                        "cmpl %5, %%eax                        \n\t"\
545
                        " jb 1b                                \n\t"
546

    
547
/*
 * WRITEBGR24MMX: asm fragment that packs 8 pixels into 24 bytes (3 bytes per
 * pixel) and stores them, using only shifts and unpacks (no mask constants
 * are loaded from memory).
 *
 * Input registers: mm2=B, mm4=G, mm5=R, mm7=0.
 * The channels are interleaved into four registers of two "0RGB" pixels each;
 * psllq $40 + punpckhdq then removes the padding byte between the two pixels
 * of every register, and the 6-byte groups are spliced into three quadwords.
 * Each quadword is written as soon as it is complete: (ebx), 8(ebx), 16(ebx).
 * Afterwards ebx is advanced by 24, eax by 8, and the code jumps back to local
 * label 1 while eax < %5. Label 1 has to be provided by the code placed in
 * front of this macro inside the same asm statement.
 * Overwrites mm0-mm7; in particular mm7 is used as scratch and is no longer
 * zero when the fragment falls through or loops.
 */
#define WRITEBGR24MMX \
548
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
549
                        "movq %%mm2, %%mm1                \n\t" /* B */\
550
                        "movq %%mm5, %%mm6                \n\t" /* R */\
551
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
552
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
553
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
554
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
555
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
556
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
557
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
558
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
559
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
560
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
561
\
562
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
563
                        "movq %%mm2, %%mm6                \n\t" /* 0RGB0RGB 1 */\
564
                        "movq %%mm1, %%mm5                \n\t" /* 0RGB0RGB 2 */\
565
                        "movq %%mm3, %%mm7                \n\t" /* 0RGB0RGB 3 */\
566
\
567
                        "psllq $40, %%mm0                \n\t" /* RGB00000 0 */\
568
                        "psllq $40, %%mm2                \n\t" /* RGB00000 1 */\
569
                        "psllq $40, %%mm1                \n\t" /* RGB00000 2 */\
570
                        "psllq $40, %%mm3                \n\t" /* RGB00000 3 */\
571
\
572
                        "punpckhdq %%mm4, %%mm0                \n\t" /* 0RGBRGB0 0 */\
573
                        "punpckhdq %%mm6, %%mm2                \n\t" /* 0RGBRGB0 1 */\
574
                        "punpckhdq %%mm5, %%mm1                \n\t" /* 0RGBRGB0 2 */\
575
                        "punpckhdq %%mm7, %%mm3                \n\t" /* 0RGBRGB0 3 */\
576
\
577
                        "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
578
                        "movq %%mm2, %%mm6                \n\t" /* 0RGBRGB0 1 */\
579
                        "psllq $40, %%mm2                \n\t" /* GB000000 1 */\
580
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
581
                        MOVNTQ(%%mm0, (%%ebx))\
582
\
583
                        "psrlq $24, %%mm6                \n\t" /* 0000RGBR 1 */\
584
                        "movq %%mm1, %%mm5                \n\t" /* 0RGBRGB0 2 */\
585
                        "psllq $24, %%mm1                \n\t" /* BRGB0000 2 */\
586
                        "por %%mm1, %%mm6                \n\t" /* BRGBRGBR 1 */\
587
                        MOVNTQ(%%mm6, 8(%%ebx))\
588
\
589
                        "psrlq $40, %%mm5                \n\t" /* 000000RG 2 */\
590
                        "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
591
                        "por %%mm3, %%mm5                \n\t" /* RGBRGBRG 2 */\
592
                        MOVNTQ(%%mm5, 16(%%ebx))\
593
\
594
                        "addl $24, %%ebx                \n\t"\
595
\
596
                        "addl $8, %%eax                        \n\t"\
597
                        "cmpl %5, %%eax                        \n\t"\
598
                        " jb 1b                                \n\t"
599

    
600
/*
 * WRITEBGR24MMX2: asm fragment that packs 8 pixels into 24 bytes (3 bytes per
 * pixel) and stores them, built around pshufw.
 *
 * Input registers: mm2=B, mm4=G, mm5=R.
 * For each of the three output quadwords the needed words of B, G and R are
 * replicated with pshufw, the wanted bytes are selected with the masks
 * M24A / M24B / M24C (defined outside this macro) and the three channels are
 * OR-ed together. G is shifted by one byte (psllq for the first quadword,
 * psrlq of mm4 itself for the other two) so its bytes land between B and R.
 * The quadwords are written to (ebx), 8(ebx) and 16(ebx); ebx is then
 * advanced by 24, eax by 8, and the code jumps back to local label 1 while
 * eax < %5. Label 1 has to be provided by the code placed in front of this
 * macro inside the same asm statement.
 * Overwrites mm0, mm1, mm3, mm4, mm6 and mm7 (mm0 and mm7 hold masks, so mm7
 * is not zero afterwards).
 */
#define WRITEBGR24MMX2 \
601
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
602
                        "movq "MANGLE(M24A)", %%mm0        \n\t"\
603
                        "movq "MANGLE(M24C)", %%mm7        \n\t"\
604
                        "pshufw $0x50, %%mm2, %%mm1        \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
605
                        "pshufw $0x50, %%mm4, %%mm3        \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
606
                        "pshufw $0x00, %%mm5, %%mm6        \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
607
\
608
                        "pand %%mm0, %%mm1                \n\t" /*    B2        B1       B0 */\
609
                        "pand %%mm0, %%mm3                \n\t" /*    G2        G1       G0 */\
610
                        "pand %%mm7, %%mm6                \n\t" /*       R1        R0       */\
611
\
612
                        "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
613
                        "por %%mm1, %%mm6                \n\t"\
614
                        "por %%mm3, %%mm6                \n\t"\
615
                        MOVNTQ(%%mm6, (%%ebx))\
616
\
617
                        "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
618
                        "pshufw $0xA5, %%mm2, %%mm1        \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
619
                        "pshufw $0x55, %%mm4, %%mm3        \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
620
                        "pshufw $0xA5, %%mm5, %%mm6        \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
621
\
622
                        "pand "MANGLE(M24B)", %%mm1        \n\t" /* B5       B4        B3    */\
623
                        "pand %%mm7, %%mm3                \n\t" /*       G4        G3       */\
624
                        "pand %%mm0, %%mm6                \n\t" /*    R4        R3       R2 */\
625
\
626
                        "por %%mm1, %%mm3                \n\t" /* B5    G4 B4     G3 B3    */\
627
                        "por %%mm3, %%mm6                \n\t"\
628
                        MOVNTQ(%%mm6, 8(%%ebx))\
629
\
630
                        "pshufw $0xFF, %%mm2, %%mm1        \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
631
                        "pshufw $0xFA, %%mm4, %%mm3        \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
632
                        "pshufw $0xFA, %%mm5, %%mm6        \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
633
\
634
                        "pand %%mm7, %%mm1                \n\t" /*       B7        B6       */\
635
                        "pand %%mm0, %%mm3                \n\t" /*    G7        G6       G5 */\
636
                        "pand "MANGLE(M24B)", %%mm6        \n\t" /* R7       R6        R5    */\
637
\
638
                        "por %%mm1, %%mm3                \n\t"\
639
                        "por %%mm3, %%mm6                \n\t"\
640
                        MOVNTQ(%%mm6, 16(%%ebx))\
641
\
642
                        "addl $24, %%ebx                \n\t"\
643
\
644
                        "addl $8, %%eax                        \n\t"\
645
                        "cmpl %5, %%eax                        \n\t"\
646
                        " jb 1b                                \n\t"
647

    
648
/*
 * Select the 24 bit store fragment for this instantiation of the template:
 * the pshufw based variant when HAVE_MMX2 is defined, the shift/unpack
 * variant otherwise. The #undef is needed because WRITEBGR24 may still be
 * defined from a previous definition.
 */
#ifdef HAVE_MMX2
649
#undef WRITEBGR24
650
#define WRITEBGR24 WRITEBGR24MMX2
651
#else
652
#undef WRITEBGR24
653
#define WRITEBGR24 WRITEBGR24MMX
654
#endif
655

    
656
static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
657
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
658
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW,
659
                                    int16_t * lumMmxFilter, int16_t * chrMmxFilter)
660
{
661
#ifdef HAVE_MMX
662
        if(uDest != NULL)
663
        {
664
                asm volatile(
665
                                YSCALEYUV2YV12X(0)
666
                                :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
667
                                "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (chrDstW)
668
                                : "%eax", "%edx", "%esi"
669
                        );
670

    
671
                asm volatile(
672
                                YSCALEYUV2YV12X(4096)
673
                                :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
674
                                "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (chrDstW)
675
                                : "%eax", "%edx", "%esi"
676
                        );
677
        }
678

    
679
        asm volatile(
680
                        YSCALEYUV2YV12X(0)
681
                        :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
682
                           "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
683
                        : "%eax", "%edx", "%esi"
684
                );
685
#else
686
yuv2yuvXinC(c, lumFilter, lumSrc, lumFilterSize,
687
            chrFilter, chrSrc, chrFilterSize,
688
            dest, uDest, vDest);
689
#endif
690
}
691

    
692
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
693
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW, int chrDstW)
694
{
695
#ifdef HAVE_MMX
696
        if(uDest != NULL)
697
        {
698
                asm volatile(
699
                                YSCALEYUV2YV121
700
                                :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
701
                                "g" (-chrDstW)
702
                                : "%eax"
703
                        );
704

    
705
                asm volatile(
706
                                YSCALEYUV2YV121
707
                                :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
708
                                "g" (-chrDstW)
709
                                : "%eax"
710
                        );
711
        }
712

    
713
        asm volatile(
714
                YSCALEYUV2YV121
715
                :: "r" (lumSrc + dstW), "r" (dest + dstW),
716
                "g" (-dstW)
717
                : "%eax"
718
        );
719
#else
720
        int i;
721
        for(i=0; i<dstW; i++)
722
        {
723
                int val= lumSrc[i]>>7;
724
                
725
                if(val&256){
726
                        if(val<0) val=0;
727
                        else      val=255;
728
                }
729

    
730
                dest[i]= val;
731
        }
732

    
733
        if(uDest != NULL)
734
                for(i=0; i<chrDstW; i++)
735
                {
736
                        int u=chrSrc[i]>>7;
737
                        int v=chrSrc[i + 2048]>>7;
738

    
739
                        if((u|v)&256){
740
                                if(u<0)         u=0;
741
                                else if (u>255) u=255;
742
                                if(v<0)         v=0;
743
                                else if (v>255) v=255;
744
                        }
745

    
746
                        uDest[i]= u;
747
                        vDest[i]= v;
748
                }
749
#endif
750
}
751

    
752

    
753
/**
 * Vertical scale YV12 to packed RGB.
 *
 * With HAVE_MMX the output format selects one inline asm kernel; every kernel
 * is YSCALEYUV2RGBX followed by the matching WRITEBGR* store fragment.
 * Handled formats: IMGFMT_BGR32, IMGFMT_BGR24, IMGFMT_BGR15, IMGFMT_BGR16.
 * Any other dstFormat matches no branch, so nothing is written in that build.
 * Without HAVE_MMX the work is delegated to yuv2rgbXinC().
 *
 * @param lumFilter     luma filter coefficients (used by the C fallback only)
 * @param lumSrc        array of lumFilterSize luma source line pointers
 * @param lumFilterSize number of luma filter taps
 * @param chrFilter     chroma filter coefficients (used by the C fallback only)
 * @param chrSrc        array of chrFilterSize chroma source line pointers
 * @param chrFilterSize number of chroma filter taps
 * @param dest          destination line
 * @param dstW          output width in pixels
 * @param dstFormat     one of the IMGFMT_BGR* values listed above
 * @param lumMmxFilter  luma coefficients for the asm kernels (MMX path only)
 * @param chrMmxFilter  chroma coefficients for the asm kernels (MMX path only)
 */
756
static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
757
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
758
                            uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
759
{
760
/*        if(flags&SWS_FULL_UV_IPOL)
761
        {
762
//FIXME
763
        }//FULL_UV_IPOL
764
        else*/
765
        {
766
#ifdef HAVE_MMX
                /* All four kernels share one operand list: %0/%1 negated filter
                   sizes, %2/%3 coefficient arrays advanced by the filter size,
                   %4 dest, %5 dstW, %6/%7 source-pointer arrays advanced by the
                   filter size. */
767
                if(dstFormat == IMGFMT_BGR32) //FIXME untested
768
                {
769
                        asm volatile(
770
                                YSCALEYUV2RGBX
771
                                WRITEBGR32
772

    
773
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
774
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
775
                           "r" (dest), "m" (dstW),
776
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
777
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
778
                        );
779
                }
780
                else if(dstFormat == IMGFMT_BGR24) //FIXME untested
781
                {
                        /* WRITEBGR24 stores through ebx, so ebx is set to
                           dest + 3*eax (3 bytes per pixel) before it runs. */
782
                        asm volatile(
783
                                YSCALEYUV2RGBX
784
                                "leal (%%eax, %%eax, 2), %%ebx        \n\t" //FIXME optimize
785
                                "addl %4, %%ebx                        \n\t"
786
                                WRITEBGR24
787

    
788
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
789
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
790
                           "r" (dest), "m" (dstW),
791
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
792
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
793
                        );
794
                }
795
                else if(dstFormat==IMGFMT_BGR15)
796
                {
797
                        asm volatile(
798
                                YSCALEYUV2RGBX
799
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
800
#ifdef DITHER1XBPP
                                /* add the per-channel dither values with unsigned
                                   saturation before the low bits are dropped */
801
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
802
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
803
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
804
#endif
805

    
806
                                WRITEBGR15
807

    
808
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
809
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
810
                           "r" (dest), "m" (dstW),
811
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
812
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
813
                        );
814
                }
815
                else if(dstFormat==IMGFMT_BGR16)
816
                {
817
                        asm volatile(
818
                                YSCALEYUV2RGBX
819
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
820
#ifdef DITHER1XBPP
                                /* same as the BGR15 case, but green uses g6Dither */
821
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
822
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
823
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
824
#endif
825

    
826
                                WRITEBGR16
827

    
828
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
829
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
830
                           "r" (dest), "m" (dstW),
831
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
832
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
833
                        );
834
                }
835
#else
/* portable fallback; the MMX coefficient arrays are not needed here */
836
yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
837
            chrFilter, chrSrc, chrFilterSize,
838
            dest, dstW, dstFormat);
839

    
840
#endif
841
        } //!FULL_UV_IPOL
842
}
843

    
844

    
845
/**
846
 * vertical bilinear scale YV12 to RGB
847
 */
848
static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
849
                            uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags)
850
{
851
        int yalpha1=yalpha^4095;
852
        int uvalpha1=uvalpha^4095;
853

    
854
        if(flags&SWS_FULL_CHR_H_INT)
855
        {
856

    
857
#ifdef HAVE_MMX
858
                if(dstFormat==IMGFMT_BGR32)
859
                {
860
                        asm volatile(
861

    
862

    
863
FULL_YSCALEYUV2RGB
864
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
865
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
866

    
867
                        "movq %%mm3, %%mm1                \n\t"
868
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
869
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
870

    
871
                        MOVNTQ(%%mm3, (%4, %%eax, 4))
872
                        MOVNTQ(%%mm1, 8(%4, %%eax, 4))
873

    
874
                        "addl $4, %%eax                        \n\t"
875
                        "cmpl %5, %%eax                        \n\t"
876
                        " jb 1b                                \n\t"
877

    
878

    
879
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
880
                        "m" (yalpha1), "m" (uvalpha1)
881
                        : "%eax"
882
                        );
883
                }
884
                else if(dstFormat==IMGFMT_BGR24)
885
                {
886
                        asm volatile(
887

    
888
FULL_YSCALEYUV2RGB
889

    
890
                                                                // lsb ... msb
891
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
892
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
893

    
894
                        "movq %%mm3, %%mm1                \n\t"
895
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
896
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
897

    
898
                        "movq %%mm3, %%mm2                \n\t" // BGR0BGR0
899
                        "psrlq $8, %%mm3                \n\t" // GR0BGR00
900
                        "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
901
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
902
                        "por %%mm2, %%mm3                \n\t" // BGRBGR00
903
                        "movq %%mm1, %%mm2                \n\t"
904
                        "psllq $48, %%mm1                \n\t" // 000000BG
905
                        "por %%mm1, %%mm3                \n\t" // BGRBGRBG
906

    
907
                        "movq %%mm2, %%mm1                \n\t" // BGR0BGR0
908
                        "psrld $16, %%mm2                \n\t" // R000R000
909
                        "psrlq $24, %%mm1                \n\t" // 0BGR0000
910
                        "por %%mm2, %%mm1                \n\t" // RBGRR000
911

    
912
                        "movl %4, %%ebx                        \n\t"
913
                        "addl %%eax, %%ebx                \n\t"
914

    
915
#ifdef HAVE_MMX2
916
                        //FIXME Alignment
917
                        "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
918
                        "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
919
#else
920
                        "movd %%mm3, (%%ebx, %%eax, 2)        \n\t"
921
                        "psrlq $32, %%mm3                \n\t"
922
                        "movd %%mm3, 4(%%ebx, %%eax, 2)        \n\t"
923
                        "movd %%mm1, 8(%%ebx, %%eax, 2)        \n\t"
924
#endif
925
                        "addl $4, %%eax                        \n\t"
926
                        "cmpl %5, %%eax                        \n\t"
927
                        " jb 1b                                \n\t"
928

    
929
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
930
                        "m" (yalpha1), "m" (uvalpha1)
931
                        : "%eax", "%ebx"
932
                        );
933
                }
934
                else if(dstFormat==IMGFMT_BGR15)
935
                {
936
                        asm volatile(
937

    
938
FULL_YSCALEYUV2RGB
939
#ifdef DITHER1XBPP
940
                        "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
941
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
942
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
943
#endif
944
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
945
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
946
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
947

    
948
                        "psrlw $3, %%mm3                \n\t"
949
                        "psllw $2, %%mm1                \n\t"
950
                        "psllw $7, %%mm0                \n\t"
951
                        "pand "MANGLE(g15Mask)", %%mm1        \n\t"
952
                        "pand "MANGLE(r15Mask)", %%mm0        \n\t"
953

    
954
                        "por %%mm3, %%mm1                \n\t"
955
                        "por %%mm1, %%mm0                \n\t"
956

    
957
                        MOVNTQ(%%mm0, (%4, %%eax, 2))
958

    
959
                        "addl $4, %%eax                        \n\t"
960
                        "cmpl %5, %%eax                        \n\t"
961
                        " jb 1b                                \n\t"
962

    
963
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
964
                        "m" (yalpha1), "m" (uvalpha1)
965
                        : "%eax"
966
                        );
967
                }
968
                else if(dstFormat==IMGFMT_BGR16)
969
                {
970
                        asm volatile(
971

    
972
FULL_YSCALEYUV2RGB
973
#ifdef DITHER1XBPP
974
                        "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
975
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
976
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
977
#endif
978
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
979
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
980
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
981

    
982
                        "psrlw $3, %%mm3                \n\t"
983
                        "psllw $3, %%mm1                \n\t"
984
                        "psllw $8, %%mm0                \n\t"
985
                        "pand "MANGLE(g16Mask)", %%mm1        \n\t"
986
                        "pand "MANGLE(r16Mask)", %%mm0        \n\t"
987

    
988
                        "por %%mm3, %%mm1                \n\t"
989
                        "por %%mm1, %%mm0                \n\t"
990

    
991
                        MOVNTQ(%%mm0, (%4, %%eax, 2))
992

    
993
                        "addl $4, %%eax                        \n\t"
994
                        "cmpl %5, %%eax                        \n\t"
995
                        " jb 1b                                \n\t"
996

    
997
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
998
                        "m" (yalpha1), "m" (uvalpha1)
999
                        : "%eax"
1000
                        );
1001
                }
1002
#else
1003
                if(dstFormat==IMGFMT_BGR32)
1004
                {
1005
                        int i;
1006
#ifdef WORDS_BIGENDIAN
1007
                        dest++;
1008
#endif
1009
                        for(i=0;i<dstW;i++){
1010
                                // vertical linear interpolation && yuv2rgb in a single step:
1011
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1012
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1013
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1014
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1015
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1016
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1017
                                dest+= 4;
1018
                        }
1019
                }
1020
                else if(dstFormat==IMGFMT_BGR24)
1021
                {
1022
                        int i;
1023
                        for(i=0;i<dstW;i++){
1024
                                // vertical linear interpolation && yuv2rgb in a single step:
1025
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1026
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1027
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1028
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1029
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1030
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1031
                                dest+= 3;
1032
                        }
1033
                }
1034
                else if(dstFormat==IMGFMT_BGR16)
1035
                {
1036
                        int i;
1037
                        for(i=0;i<dstW;i++){
1038
                                // vertical linear interpolation && yuv2rgb in a single step:
1039
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1040
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1041
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1042

    
1043
                                ((uint16_t*)dest)[i] =
1044
                                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1045
                                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1046
                                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1047
                        }
1048
                }
1049
                else if(dstFormat==IMGFMT_BGR15)
1050
                {
1051
                        int i;
1052
                        for(i=0;i<dstW;i++){
1053
                                // vertical linear interpolation && yuv2rgb in a single step:
1054
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1055
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1056
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1057

    
1058
                                ((uint16_t*)dest)[i] =
1059
                                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1060
                                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1061
                                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1062
                        }
1063
                }
1064
#endif
1065
        }//FULL_UV_IPOL
1066
        else
1067
        {
1068
#ifdef HAVE_MMX
1069
                if(dstFormat==IMGFMT_BGR32)
1070
                {
1071
                        asm volatile(
1072
                                YSCALEYUV2RGB
1073
                                WRITEBGR32
1074

    
1075
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1076
                        "m" (yalpha1), "m" (uvalpha1)
1077
                        : "%eax"
1078
                        );
1079
                }
1080
                else if(dstFormat==IMGFMT_BGR24)
1081
                {
1082
                        asm volatile(
1083
                                "movl %4, %%ebx                        \n\t"
1084
                                YSCALEYUV2RGB
1085
                                WRITEBGR24
1086

    
1087
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1088
                        "m" (yalpha1), "m" (uvalpha1)
1089
                        : "%eax", "%ebx"
1090
                        );
1091
                }
1092
                else if(dstFormat==IMGFMT_BGR15)
1093
                {
1094
                        asm volatile(
1095
                                YSCALEYUV2RGB
1096
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1097
#ifdef DITHER1XBPP
1098
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1099
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1100
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1101
#endif
1102

    
1103
                                WRITEBGR15
1104

    
1105
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1106
                        "m" (yalpha1), "m" (uvalpha1)
1107
                        : "%eax"
1108
                        );
1109
                }
1110
                else if(dstFormat==IMGFMT_BGR16)
1111
                {
1112
                        asm volatile(
1113
                                YSCALEYUV2RGB
1114
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1115
#ifdef DITHER1XBPP
1116
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1117
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1118
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1119
#endif
1120

    
1121
                                WRITEBGR16
1122

    
1123
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1124
                        "m" (yalpha1), "m" (uvalpha1)
1125
                        : "%eax"
1126
                        );
1127
                }
1128
#else
1129
                if(dstFormat==IMGFMT_BGR32)
1130
                {
1131
                        int i;
1132
#ifdef WORDS_BIGENDIAN
1133
                        dest++;
1134
#endif
1135
                        for(i=0; i<dstW-1; i+=2){
1136
                                // vertical linear interpolation && yuv2rgb in a single step:
1137
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1138
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1139
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1140
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1141

    
1142
                                int Cb= yuvtab_40cf[U];
1143
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1144
                                int Cr= yuvtab_3343[V];
1145

    
1146
                                dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1147
                                dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1148
                                dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1149

    
1150
                                dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1151
                                dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1152
                                dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1153
                        }
1154
                }
1155
                else if(dstFormat==IMGFMT_BGR24)
1156
                {
1157
                        int i;
1158
                        for(i=0; i<dstW-1; i+=2){
1159
                                // vertical linear interpolation && yuv2rgb in a single step:
1160
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1161
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1162
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1163
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1164

    
1165
                                int Cb= yuvtab_40cf[U];
1166
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1167
                                int Cr= yuvtab_3343[V];
1168

    
1169
                                dest[0]=clip_table[((Y1 + Cb) >>13)];
1170
                                dest[1]=clip_table[((Y1 + Cg) >>13)];
1171
                                dest[2]=clip_table[((Y1 + Cr) >>13)];
1172

    
1173
                                dest[3]=clip_table[((Y2 + Cb) >>13)];
1174
                                dest[4]=clip_table[((Y2 + Cg) >>13)];
1175
                                dest[5]=clip_table[((Y2 + Cr) >>13)];
1176
                                dest+=6;
1177
                        }
1178
                }
1179
                else if(dstFormat==IMGFMT_BGR16)
1180
                {
1181
                        int i;
1182
#ifdef DITHER1XBPP
1183
                        static int ditherb1=1<<14;
1184
                        static int ditherg1=1<<13;
1185
                        static int ditherr1=2<<14;
1186
                        static int ditherb2=3<<14;
1187
                        static int ditherg2=3<<13;
1188
                        static int ditherr2=0<<14;
1189

    
1190
                        ditherb1 ^= (1^2)<<14;
1191
                        ditherg1 ^= (1^2)<<13;
1192
                        ditherr1 ^= (1^2)<<14;
1193
                        ditherb2 ^= (3^0)<<14;
1194
                        ditherg2 ^= (3^0)<<13;
1195
                        ditherr2 ^= (3^0)<<14;
1196
#else
1197
                        const int ditherb1=0;
1198
                        const int ditherg1=0;
1199
                        const int ditherr1=0;
1200
                        const int ditherb2=0;
1201
                        const int ditherg2=0;
1202
                        const int ditherr2=0;
1203
#endif
1204
                        for(i=0; i<dstW-1; i+=2){
1205
                                // vertical linear interpolation && yuv2rgb in a single step:
1206
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1207
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1208
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1209
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1210

    
1211
                                int Cb= yuvtab_40cf[U];
1212
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1213
                                int Cr= yuvtab_3343[V];
1214

    
1215
                                ((uint16_t*)dest)[i] =
1216
                                        clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1217
                                        clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1218
                                        clip_table16r[(Y1 + Cr + ditherr1) >>13];
1219

    
1220
                                ((uint16_t*)dest)[i+1] =
1221
                                        clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1222
                                        clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1223
                                        clip_table16r[(Y2 + Cr + ditherr2) >>13];
1224
                        }
1225
                }
1226
                else if(dstFormat==IMGFMT_BGR15)
1227
                {
1228
                        int i;
1229
#ifdef DITHER1XBPP
1230
                        static int ditherb1=1<<14;
1231
                        static int ditherg1=1<<14;
1232
                        static int ditherr1=2<<14;
1233
                        static int ditherb2=3<<14;
1234
                        static int ditherg2=3<<14;
1235
                        static int ditherr2=0<<14;
1236

    
1237
                        ditherb1 ^= (1^2)<<14;
1238
                        ditherg1 ^= (1^2)<<14;
1239
                        ditherr1 ^= (1^2)<<14;
1240
                        ditherb2 ^= (3^0)<<14;
1241
                        ditherg2 ^= (3^0)<<14;
1242
                        ditherr2 ^= (3^0)<<14;
1243
#else
1244
                        const int ditherb1=0;
1245
                        const int ditherg1=0;
1246
                        const int ditherr1=0;
1247
                        const int ditherb2=0;
1248
                        const int ditherg2=0;
1249
                        const int ditherr2=0;
1250
#endif
1251
                        for(i=0; i<dstW-1; i+=2){
1252
                                // vertical linear interpolation && yuv2rgb in a single step:
1253
                                int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1254
                                int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
1255
                                int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1256
                                int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1257

    
1258
                                int Cb= yuvtab_40cf[U];
1259
                                int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1260
                                int Cr= yuvtab_3343[V];
1261

    
1262
                                ((uint16_t*)dest)[i] =
1263
                                        clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1264
                                        clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1265
                                        clip_table15r[(Y1 + Cr + ditherr1) >>13];
1266

    
1267
                                ((uint16_t*)dest)[i+1] =
1268
                                        clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1269
                                        clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1270
                                        clip_table15r[(Y2 + Cr + ditherr2) >>13];
1271
                        }
1272
                }
1273
#endif
1274
        } //!FULL_UV_IPOL
1275
}
1276

    
1277
/**
1278
 * YV12 to RGB without scaling or interpolating
1279
 */
1280
static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1281
                            uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags)
1282
{
1283
        int uvalpha1=uvalpha^4095;
1284
        const int yalpha1=0;
1285

    
1286
        if(flags&SWS_FULL_CHR_H_INT)
1287
        {
1288
                RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags);
1289
                return;
1290
        }
1291

    
1292
#ifdef HAVE_MMX
1293
        if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1294
        {
1295
                if(dstFormat==IMGFMT_BGR32)
1296
                {
1297
                        asm volatile(
1298
                                YSCALEYUV2RGB1
1299
                                WRITEBGR32
1300
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1301
                        "m" (yalpha1), "m" (uvalpha1)
1302
                        : "%eax"
1303
                        );
1304
                }
1305
                else if(dstFormat==IMGFMT_BGR24)
1306
                {
1307
                        asm volatile(
1308
                                "movl %4, %%ebx                        \n\t"
1309
                                YSCALEYUV2RGB1
1310
                                WRITEBGR24
1311
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1312
                        "m" (yalpha1), "m" (uvalpha1)
1313
                        : "%eax", "%ebx"
1314
                        );
1315
                }
1316
                else if(dstFormat==IMGFMT_BGR15)
1317
                {
1318
                        asm volatile(
1319
                                YSCALEYUV2RGB1
1320
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1321
#ifdef DITHER1XBPP
1322
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1323
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1324
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1325
#endif
1326
                                WRITEBGR15
1327
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1328
                        "m" (yalpha1), "m" (uvalpha1)
1329
                        : "%eax"
1330
                        );
1331
                }
1332
                else if(dstFormat==IMGFMT_BGR16)
1333
                {
1334
                        asm volatile(
1335
                                YSCALEYUV2RGB1
1336
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1337
#ifdef DITHER1XBPP
1338
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1339
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1340
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1341
#endif
1342

    
1343
                                WRITEBGR16
1344
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1345
                        "m" (yalpha1), "m" (uvalpha1)
1346
                        : "%eax"
1347
                        );
1348
                }
1349
        }
1350
        else
1351
        {
1352
                if(dstFormat==IMGFMT_BGR32)
1353
                {
1354
                        asm volatile(
1355
                                YSCALEYUV2RGB1b
1356
                                WRITEBGR32
1357
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1358
                        "m" (yalpha1), "m" (uvalpha1)
1359
                        : "%eax"
1360
                        );
1361
                }
1362
                else if(dstFormat==IMGFMT_BGR24)
1363
                {
1364
                        asm volatile(
1365
                                "movl %4, %%ebx                        \n\t"
1366
                                YSCALEYUV2RGB1b
1367
                                WRITEBGR24
1368
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1369
                        "m" (yalpha1), "m" (uvalpha1)
1370
                        : "%eax", "%ebx"
1371
                        );
1372
                }
1373
                else if(dstFormat==IMGFMT_BGR15)
1374
                {
1375
                        asm volatile(
1376
                                YSCALEYUV2RGB1b
1377
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1378
#ifdef DITHER1XBPP
1379
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1380
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1381
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1382
#endif
1383
                                WRITEBGR15
1384
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1385
                        "m" (yalpha1), "m" (uvalpha1)
1386
                        : "%eax"
1387
                        );
1388
                }
1389
                else if(dstFormat==IMGFMT_BGR16)
1390
                {
1391
                        asm volatile(
1392
                                YSCALEYUV2RGB1b
1393
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1394
#ifdef DITHER1XBPP
1395
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1396
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1397
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1398
#endif
1399

    
1400
                                WRITEBGR16
1401
                        :: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1402
                        "m" (yalpha1), "m" (uvalpha1)
1403
                        : "%eax"
1404
                        );
1405
                }
1406
        }
1407
#else
1408
//FIXME write 2 versions (for even & odd lines)
1409

    
1410
        if(dstFormat==IMGFMT_BGR32)
1411
        {
1412
                int i;
1413
#ifdef WORDS_BIGENDIAN
1414
                dest++;
1415
#endif
1416
                for(i=0; i<dstW-1; i+=2){
1417
                        // vertical linear interpolation && yuv2rgb in a single step:
1418
                        int Y1=yuvtab_2568[buf0[i]>>7];
1419
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1420
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1421
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1422

    
1423
                        int Cb= yuvtab_40cf[U];
1424
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1425
                        int Cr= yuvtab_3343[V];
1426

    
1427
                        dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
1428
                        dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
1429
                        dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
1430

    
1431
                        dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
1432
                        dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
1433
                        dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
1434
                }
1435
        }
1436
        else if(dstFormat==IMGFMT_BGR24)
1437
        {
1438
                int i;
1439
                for(i=0; i<dstW-1; i+=2){
1440
                        // vertical linear interpolation && yuv2rgb in a single step:
1441
                        int Y1=yuvtab_2568[buf0[i]>>7];
1442
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1443
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1444
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1445

    
1446
                        int Cb= yuvtab_40cf[U];
1447
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1448
                        int Cr= yuvtab_3343[V];
1449

    
1450
                        dest[0]=clip_table[((Y1 + Cb) >>13)];
1451
                        dest[1]=clip_table[((Y1 + Cg) >>13)];
1452
                        dest[2]=clip_table[((Y1 + Cr) >>13)];
1453

    
1454
                        dest[3]=clip_table[((Y2 + Cb) >>13)];
1455
                        dest[4]=clip_table[((Y2 + Cg) >>13)];
1456
                        dest[5]=clip_table[((Y2 + Cr) >>13)];
1457
                        dest+=6;
1458
                }
1459
        }
1460
        else if(dstFormat==IMGFMT_BGR16)
1461
        {
1462
                int i;
1463
#ifdef DITHER1XBPP
1464
                static int ditherb1=1<<14;
1465
                static int ditherg1=1<<13;
1466
                static int ditherr1=2<<14;
1467
                static int ditherb2=3<<14;
1468
                static int ditherg2=3<<13;
1469
                static int ditherr2=0<<14;
1470

    
1471
                ditherb1 ^= (1^2)<<14;
1472
                ditherg1 ^= (1^2)<<13;
1473
                ditherr1 ^= (1^2)<<14;
1474
                ditherb2 ^= (3^0)<<14;
1475
                ditherg2 ^= (3^0)<<13;
1476
                ditherr2 ^= (3^0)<<14;
1477
#else
1478
                const int ditherb1=0;
1479
                const int ditherg1=0;
1480
                const int ditherr1=0;
1481
                const int ditherb2=0;
1482
                const int ditherg2=0;
1483
                const int ditherr2=0;
1484
#endif
1485
                for(i=0; i<dstW-1; i+=2){
1486
                        // vertical linear interpolation && yuv2rgb in a single step:
1487
                        int Y1=yuvtab_2568[buf0[i]>>7];
1488
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1489
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1490
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1491

    
1492
                        int Cb= yuvtab_40cf[U];
1493
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1494
                        int Cr= yuvtab_3343[V];
1495

    
1496
                        ((uint16_t*)dest)[i] =
1497
                                clip_table16b[(Y1 + Cb + ditherb1) >>13] |
1498
                                clip_table16g[(Y1 + Cg + ditherg1) >>13] |
1499
                                clip_table16r[(Y1 + Cr + ditherr1) >>13];
1500

    
1501
                        ((uint16_t*)dest)[i+1] =
1502
                                clip_table16b[(Y2 + Cb + ditherb2) >>13] |
1503
                                clip_table16g[(Y2 + Cg + ditherg2) >>13] |
1504
                                clip_table16r[(Y2 + Cr + ditherr2) >>13];
1505
                }
1506
        }
1507
        else if(dstFormat==IMGFMT_BGR15)
1508
        {
1509
                int i;
1510
#ifdef DITHER1XBPP
1511
                static int ditherb1=1<<14;
1512
                static int ditherg1=1<<14;
1513
                static int ditherr1=2<<14;
1514
                static int ditherb2=3<<14;
1515
                static int ditherg2=3<<14;
1516
                static int ditherr2=0<<14;
1517

    
1518
                ditherb1 ^= (1^2)<<14;
1519
                ditherg1 ^= (1^2)<<14;
1520
                ditherr1 ^= (1^2)<<14;
1521
                ditherb2 ^= (3^0)<<14;
1522
                ditherg2 ^= (3^0)<<14;
1523
                ditherr2 ^= (3^0)<<14;
1524
#else
1525
                const int ditherb1=0;
1526
                const int ditherg1=0;
1527
                const int ditherr1=0;
1528
                const int ditherb2=0;
1529
                const int ditherg2=0;
1530
                const int ditherr2=0;
1531
#endif
1532
                for(i=0; i<dstW-1; i+=2){
1533
                        // vertical linear interpolation && yuv2rgb in a single step:
1534
                        int Y1=yuvtab_2568[buf0[i]>>7];
1535
                        int Y2=yuvtab_2568[buf0[i+1]>>7];
1536
                        int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
1537
                        int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);
1538

    
1539
                        int Cb= yuvtab_40cf[U];
1540
                        int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
1541
                        int Cr= yuvtab_3343[V];
1542

    
1543
                        ((uint16_t*)dest)[i] =
1544
                                clip_table15b[(Y1 + Cb + ditherb1) >>13] |
1545
                                clip_table15g[(Y1 + Cg + ditherg1) >>13] |
1546
                                clip_table15r[(Y1 + Cr + ditherr1) >>13];
1547

    
1548
                        ((uint16_t*)dest)[i+1] =
1549
                                clip_table15b[(Y2 + Cb + ditherb2) >>13] |
1550
                                clip_table15g[(Y2 + Cg + ditherg2) >>13] |
1551
                                clip_table15r[(Y2 + Cr + ditherr2) >>13];
1552
                }
1553
        }
1554
#endif
1555
}
1556

    
1557
//FIXME yuy2* can read up to 7 samples too many
1558

    
1559
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
1560
{
1561
#ifdef HAVE_MMX
1562
        asm volatile(
1563
                "movq "MANGLE(bm01010101)", %%mm2\n\t"
1564
                "movl %0, %%eax                        \n\t"
1565
                "1:                                \n\t"
1566
                "movq (%1, %%eax,2), %%mm0        \n\t"
1567
                "movq 8(%1, %%eax,2), %%mm1        \n\t"
1568
                "pand %%mm2, %%mm0                \n\t"
1569
                "pand %%mm2, %%mm1                \n\t"
1570
                "packuswb %%mm1, %%mm0                \n\t"
1571
                "movq %%mm0, (%2, %%eax)        \n\t"
1572
                "addl $8, %%eax                        \n\t"
1573
                " js 1b                                \n\t"
1574
                : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1575
                : "%eax"
1576
        );
1577
#else
1578
        int i;
1579
        for(i=0; i<width; i++)
1580
                dst[i]= src[2*i];
1581
#endif
1582
}
1583

    
1584
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1585
{
1586
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1587
        asm volatile(
1588
                "movq "MANGLE(bm01010101)", %%mm4\n\t"
1589
                "movl %0, %%eax                        \n\t"
1590
                "1:                                \n\t"
1591
                "movq (%1, %%eax,4), %%mm0        \n\t"
1592
                "movq 8(%1, %%eax,4), %%mm1        \n\t"
1593
                "movq (%2, %%eax,4), %%mm2        \n\t"
1594
                "movq 8(%2, %%eax,4), %%mm3        \n\t"
1595
                PAVGB(%%mm2, %%mm0)
1596
                PAVGB(%%mm3, %%mm1)
1597
                "psrlw $8, %%mm0                \n\t"
1598
                "psrlw $8, %%mm1                \n\t"
1599
                "packuswb %%mm1, %%mm0                \n\t"
1600
                "movq %%mm0, %%mm1                \n\t"
1601
                "psrlw $8, %%mm0                \n\t"
1602
                "pand %%mm4, %%mm1                \n\t"
1603
                "packuswb %%mm0, %%mm0                \n\t"
1604
                "packuswb %%mm1, %%mm1                \n\t"
1605
                "movd %%mm0, (%4, %%eax)        \n\t"
1606
                "movd %%mm1, (%3, %%eax)        \n\t"
1607
                "addl $4, %%eax                        \n\t"
1608
                " js 1b                                \n\t"
1609
                : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
1610
                : "%eax"
1611
        );
1612
#else
1613
        int i;
1614
        for(i=0; i<width; i++)
1615
        {
1616
                dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
1617
                dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
1618
        }
1619
#endif
1620
}
1621

    
1622
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1623
{
1624
#ifdef HAVE_MMXFIXME
1625
#else
1626
        int i;
1627
        for(i=0; i<width; i++)
1628
        {
1629
                int b= src[i*4+0];
1630
                int g= src[i*4+1];
1631
                int r= src[i*4+2];
1632

    
1633
                dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1634
        }
1635
#endif
1636
}
1637

    
1638
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1639
{
1640
#ifdef HAVE_MMXFIXME
1641
#else
1642
        int i;
1643
        for(i=0; i<width; i++)
1644
        {
1645
                int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
1646
                int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
1647
                int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
1648

    
1649
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1650
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1651
        }
1652
#endif
1653
}
1654

    
1655
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
1656
{
1657
#ifdef HAVE_MMX
1658
        asm volatile(
1659
                "movl %2, %%eax                        \n\t"
1660
                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
1661
                "movq "MANGLE(w1111)", %%mm5                \n\t"
1662
                "pxor %%mm7, %%mm7                \n\t"
1663
                "leal (%%eax, %%eax, 2), %%ebx        \n\t"
1664
                ".balign 16                        \n\t"
1665
                "1:                                \n\t"
1666
                PREFETCH" 64(%0, %%ebx)                \n\t"
1667
                "movd (%0, %%ebx), %%mm0        \n\t"
1668
                "movd 3(%0, %%ebx), %%mm1        \n\t"
1669
                "punpcklbw %%mm7, %%mm0                \n\t"
1670
                "punpcklbw %%mm7, %%mm1                \n\t"
1671
                "movd 6(%0, %%ebx), %%mm2        \n\t"
1672
                "movd 9(%0, %%ebx), %%mm3        \n\t"
1673
                "punpcklbw %%mm7, %%mm2                \n\t"
1674
                "punpcklbw %%mm7, %%mm3                \n\t"
1675
                "pmaddwd %%mm6, %%mm0                \n\t"
1676
                "pmaddwd %%mm6, %%mm1                \n\t"
1677
                "pmaddwd %%mm6, %%mm2                \n\t"
1678
                "pmaddwd %%mm6, %%mm3                \n\t"
1679
#ifndef FAST_BGR2YV12
1680
                "psrad $8, %%mm0                \n\t"
1681
                "psrad $8, %%mm1                \n\t"
1682
                "psrad $8, %%mm2                \n\t"
1683
                "psrad $8, %%mm3                \n\t"
1684
#endif
1685
                "packssdw %%mm1, %%mm0                \n\t"
1686
                "packssdw %%mm3, %%mm2                \n\t"
1687
                "pmaddwd %%mm5, %%mm0                \n\t"
1688
                "pmaddwd %%mm5, %%mm2                \n\t"
1689
                "packssdw %%mm2, %%mm0                \n\t"
1690
                "psraw $7, %%mm0                \n\t"
1691

    
1692
                "movd 12(%0, %%ebx), %%mm4        \n\t"
1693
                "movd 15(%0, %%ebx), %%mm1        \n\t"
1694
                "punpcklbw %%mm7, %%mm4                \n\t"
1695
                "punpcklbw %%mm7, %%mm1                \n\t"
1696
                "movd 18(%0, %%ebx), %%mm2        \n\t"
1697
                "movd 21(%0, %%ebx), %%mm3        \n\t"
1698
                "punpcklbw %%mm7, %%mm2                \n\t"
1699
                "punpcklbw %%mm7, %%mm3                \n\t"
1700
                "pmaddwd %%mm6, %%mm4                \n\t"
1701
                "pmaddwd %%mm6, %%mm1                \n\t"
1702
                "pmaddwd %%mm6, %%mm2                \n\t"
1703
                "pmaddwd %%mm6, %%mm3                \n\t"
1704
#ifndef FAST_BGR2YV12
1705
                "psrad $8, %%mm4                \n\t"
1706
                "psrad $8, %%mm1                \n\t"
1707
                "psrad $8, %%mm2                \n\t"
1708
                "psrad $8, %%mm3                \n\t"
1709
#endif
1710
                "packssdw %%mm1, %%mm4                \n\t"
1711
                "packssdw %%mm3, %%mm2                \n\t"
1712
                "pmaddwd %%mm5, %%mm4                \n\t"
1713
                "pmaddwd %%mm5, %%mm2                \n\t"
1714
                "addl $24, %%ebx                \n\t"
1715
                "packssdw %%mm2, %%mm4                \n\t"
1716
                "psraw $7, %%mm4                \n\t"
1717

    
1718
                "packuswb %%mm4, %%mm0                \n\t"
1719
                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"
1720

    
1721
                "movq %%mm0, (%1, %%eax)        \n\t"
1722
                "addl $8, %%eax                        \n\t"
1723
                " js 1b                                \n\t"
1724
                : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1725
                : "%eax", "%ebx"
1726
        );
1727
#else
1728
        int i;
1729
        for(i=0; i<width; i++)
1730
        {
1731
                int b= src[i*3+0];
1732
                int g= src[i*3+1];
1733
                int r= src[i*3+2];
1734

    
1735
                dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1736
        }
1737
#endif
1738
}
1739

    
1740
/**
 * Converts two lines of packed 24-bit BGR pixels into one line of U and one
 * line of V samples.
 *
 * As the C fallback shows, output sample i is derived from the 2x2 group made
 * of pixels 2*i and 2*i+1 of src1 and of src2: the four B, the four G and the
 * four R bytes are summed, weighted with RU/GU/BU (resp. RV/GV/BV), shifted
 * right by RGB2YUV_SHIFT+2 and biased by 128.
 *
 * The HAVE_MMX variant walks both lines with a negative index that counts up
 * to zero and produces four U and four V samples (24 source bytes per line)
 * per loop iteration.
 * NOTE(review): because 4 samples are stored per iteration and 8-byte loads
 * are used near the end of each 24-byte group, it looks like the asm can read
 * and write slightly past the end of the lines when width is not a multiple
 * of 4 -- confirm that callers pad their buffers.
 * NOTE(review): the asm stores to dstU/dstV but declares no "memory" clobber.
 *
 * @param dstU  destination for the U samples
 * @param dstV  destination for the V samples
 * @param src1  first source line (6 bytes are consumed per output sample)
 * @param src2  second source line (6 bytes are consumed per output sample)
 * @param width number of U (and V) samples to produce
 */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1741
{
1742
#ifdef HAVE_MMX
1743
        asm volatile(
1744
                "movl %4, %%eax                        \n\t" // eax= -width (output index)
1745
                "movq "MANGLE(w1111)", %%mm5                \n\t"
1746
                "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
1747
                "pxor %%mm7, %%mm7                \n\t"
1748
                "leal (%%eax, %%eax, 2), %%ebx        \n\t" // ebx= 3*eax
1749
                "addl %%ebx, %%ebx                \n\t" // ebx= 6*eax (source byte index)
1750
                ".balign 16                        \n\t"
1751
                "1:                                \n\t"
1752
                PREFETCH" 64(%0, %%ebx)                \n\t"
1753
                PREFETCH" 64(%1, %%ebx)                \n\t"
1754
                // first half: 2x2 averages for output samples 0 and 1 into mm0 / mm2
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1755
                "movq (%0, %%ebx), %%mm0        \n\t"
1756
                "movq (%1, %%ebx), %%mm1        \n\t"
1757
                "movq 6(%0, %%ebx), %%mm2        \n\t"
1758
                "movq 6(%1, %%ebx), %%mm3        \n\t"
1759
                PAVGB(%%mm1, %%mm0)
1760
                PAVGB(%%mm3, %%mm2)
1761
                "movq %%mm0, %%mm1                \n\t"
1762
                "movq %%mm2, %%mm3                \n\t"
1763
                "psrlq $24, %%mm0                \n\t" // shift by one 3-byte pixel
1764
                "psrlq $24, %%mm2                \n\t"
1765
                PAVGB(%%mm1, %%mm0)
1766
                PAVGB(%%mm3, %%mm2)
1767
                "punpcklbw %%mm7, %%mm0                \n\t"
1768
                "punpcklbw %%mm7, %%mm2                \n\t"
1769
#else
1770
                "movd (%0, %%ebx), %%mm0        \n\t"
1771
                "movd (%1, %%ebx), %%mm1        \n\t"
1772
                "movd 3(%0, %%ebx), %%mm2        \n\t"
1773
                "movd 3(%1, %%ebx), %%mm3        \n\t"
1774
                "punpcklbw %%mm7, %%mm0                \n\t"
1775
                "punpcklbw %%mm7, %%mm1                \n\t"
1776
                "punpcklbw %%mm7, %%mm2                \n\t"
1777
                "punpcklbw %%mm7, %%mm3                \n\t"
1778
                "paddw %%mm1, %%mm0                \n\t"
1779
                "paddw %%mm3, %%mm2                \n\t"
1780
                "paddw %%mm2, %%mm0                \n\t"
1781
                "movd 6(%0, %%ebx), %%mm4        \n\t"
1782
                "movd 6(%1, %%ebx), %%mm1        \n\t"
1783
                "movd 9(%0, %%ebx), %%mm2        \n\t"
1784
                "movd 9(%1, %%ebx), %%mm3        \n\t"
1785
                "punpcklbw %%mm7, %%mm4                \n\t"
1786
                "punpcklbw %%mm7, %%mm1                \n\t"
1787
                "punpcklbw %%mm7, %%mm2                \n\t"
1788
                "punpcklbw %%mm7, %%mm3                \n\t"
1789
                "paddw %%mm1, %%mm4                \n\t"
1790
                "paddw %%mm3, %%mm2                \n\t"
1791
                "paddw %%mm4, %%mm2                \n\t"
1792
                "psrlw $2, %%mm0                \n\t" // sum of 4 pixels -> average
1793
                "psrlw $2, %%mm2                \n\t"
1794
#endif
1795
                "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
1796
                "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
1797
                
1798
                "pmaddwd %%mm0, %%mm1                \n\t"
1799
                "pmaddwd %%mm2, %%mm3                \n\t"
1800
                "pmaddwd %%mm6, %%mm0                \n\t"
1801
                "pmaddwd %%mm6, %%mm2                \n\t"
1802
#ifndef FAST_BGR2YV12
1803
                "psrad $8, %%mm0                \n\t"
1804
                "psrad $8, %%mm1                \n\t"
1805
                "psrad $8, %%mm2                \n\t"
1806
                "psrad $8, %%mm3                \n\t"
1807
#endif
1808
                "packssdw %%mm2, %%mm0                \n\t"
1809
                "packssdw %%mm3, %%mm1                \n\t"
1810
                "pmaddwd %%mm5, %%mm0                \n\t"
1811
                "pmaddwd %%mm5, %%mm1                \n\t"
1812
                "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
1813
                "psraw $7, %%mm0                \n\t"
1814

    
1815
                // second half: same computation for output samples 2 and 3 (source bytes 12..23)
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1816
                "movq 12(%0, %%ebx), %%mm4        \n\t"
1817
                "movq 12(%1, %%ebx), %%mm1        \n\t"
1818
                "movq 18(%0, %%ebx), %%mm2        \n\t"
1819
                "movq 18(%1, %%ebx), %%mm3        \n\t"
1820
                PAVGB(%%mm1, %%mm4)
1821
                PAVGB(%%mm3, %%mm2)
1822
                "movq %%mm4, %%mm1                \n\t"
1823
                "movq %%mm2, %%mm3                \n\t"
1824
                "psrlq $24, %%mm4                \n\t"
1825
                "psrlq $24, %%mm2                \n\t"
1826
                PAVGB(%%mm1, %%mm4)
1827
                PAVGB(%%mm3, %%mm2)
1828
                "punpcklbw %%mm7, %%mm4                \n\t"
1829
                "punpcklbw %%mm7, %%mm2                \n\t"
1830
#else
1831
                "movd 12(%0, %%ebx), %%mm4        \n\t"
1832
                "movd 12(%1, %%ebx), %%mm1        \n\t"
1833
                "movd 15(%0, %%ebx), %%mm2        \n\t"
1834
                "movd 15(%1, %%ebx), %%mm3        \n\t"
1835
                "punpcklbw %%mm7, %%mm4                \n\t"
1836
                "punpcklbw %%mm7, %%mm1                \n\t"
1837
                "punpcklbw %%mm7, %%mm2                \n\t"
1838
                "punpcklbw %%mm7, %%mm3                \n\t"
1839
                "paddw %%mm1, %%mm4                \n\t"
1840
                "paddw %%mm3, %%mm2                \n\t"
1841
                "paddw %%mm2, %%mm4                \n\t"
1842
                "movd 18(%0, %%ebx), %%mm5        \n\t" // mm5 is used as scratch here ...
1843
                "movd 18(%1, %%ebx), %%mm1        \n\t"
1844
                "movd 21(%0, %%ebx), %%mm2        \n\t"
1845
                "movd 21(%1, %%ebx), %%mm3        \n\t"
1846
                "punpcklbw %%mm7, %%mm5                \n\t"
1847
                "punpcklbw %%mm7, %%mm1                \n\t"
1848
                "punpcklbw %%mm7, %%mm2                \n\t"
1849
                "punpcklbw %%mm7, %%mm3                \n\t"
1850
                "paddw %%mm1, %%mm5                \n\t"
1851
                "paddw %%mm3, %%mm2                \n\t"
1852
                "paddw %%mm5, %%mm2                \n\t"
1853
                "movq "MANGLE(w1111)", %%mm5                \n\t" // ... so w1111 is reloaded
1854
                "psrlw $2, %%mm4                \n\t"
1855
                "psrlw $2, %%mm2                \n\t"
1856
#endif
1857
                "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
1858
                "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
1859
                
1860
                "pmaddwd %%mm4, %%mm1                \n\t"
1861
                "pmaddwd %%mm2, %%mm3                \n\t"
1862
                "pmaddwd %%mm6, %%mm4                \n\t"
1863
                "pmaddwd %%mm6, %%mm2                \n\t"
1864
#ifndef FAST_BGR2YV12
1865
                "psrad $8, %%mm4                \n\t"
1866
                "psrad $8, %%mm1                \n\t"
1867
                "psrad $8, %%mm2                \n\t"
1868
                "psrad $8, %%mm3                \n\t"
1869
#endif
1870
                "packssdw %%mm2, %%mm4                \n\t"
1871
                "packssdw %%mm3, %%mm1                \n\t"
1872
                "pmaddwd %%mm5, %%mm4                \n\t"
1873
                "pmaddwd %%mm5, %%mm1                \n\t"
1874
                "addl $24, %%ebx                \n\t" // advance 8 source pixels per line
1875
                "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
1876
                "psraw $7, %%mm4                \n\t"
1877
                
1878
                // regroup to U3 U2 U1 U0 / V3 V2 V1 V0, pack to bytes and add the offset
                "movq %%mm0, %%mm1                \n\t"
1879
                "punpckldq %%mm4, %%mm0                \n\t"
1880
                "punpckhdq %%mm4, %%mm1                \n\t"
1881
                "packsswb %%mm1, %%mm0                \n\t"
1882
                "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"
1883

    
1884
                "movd %%mm0, (%2, %%eax)        \n\t" // 4 U samples
1885
                "punpckhdq %%mm0, %%mm0                \n\t"
1886
                "movd %%mm0, (%3, %%eax)        \n\t" // 4 V samples
1887
                "addl $4, %%eax                        \n\t"
1888
                " js 1b                                \n\t" // loop while the index is still negative
1889
                : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
1890
                : "%eax", "%ebx"
1891
        );
1892
#else
1893
        int i;
1894
        for(i=0; i<width; i++)
1895
        {
1896
                // sum each channel over the 2x2 pixel group
                int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
1897
                int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
1898
                int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
1899

    
1900
                // the extra +2 in the shift divides the 4-pixel sums by 4
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1901
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1902
        }
1903
#endif
1904
}
1905

    
1906
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
1907
{
1908
        int i;
1909
        for(i=0; i<width; i++)
1910
        {
1911
                int d= src[i*2] + (src[i*2+1]<<8);
1912
                int b= d&0x1F;
1913
                int g= (d>>5)&0x3F;
1914
                int r= (d>>11)&0x1F;
1915

    
1916
                dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
1917
        }
1918
}
1919

    
1920
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1921
{
1922
        int i;
1923
        for(i=0; i<width; i++)
1924
        {
1925
#if 1
1926
                int d0= le2me_32( ((uint32_t*)src1)[i] );
1927
                int d1= le2me_32( ((uint32_t*)src2)[i] );
1928
                
1929
                int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
1930
                int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);
1931

    
1932
                int dh2= (dh>>11) + (dh<<21);
1933
                int d= dh2 + dl;
1934

    
1935
                int b= d&0x7F;
1936
                int r= (d>>11)&0x7F;
1937
                int g= d>>21;
1938
#else
1939
                int d0= src1[i*4] + (src1[i*4+1]<<8);
1940
                int b0= d0&0x1F;
1941
                int g0= (d0>>5)&0x3F;
1942
                int r0= (d0>>11)&0x1F;
1943

    
1944
                int d1= src1[i*4+2] + (src1[i*4+3]<<8);
1945
                int b1= d1&0x1F;
1946
                int g1= (d1>>5)&0x3F;
1947
                int r1= (d1>>11)&0x1F;
1948

    
1949
                int d2= src2[i*4] + (src2[i*4+1]<<8);
1950
                int b2= d2&0x1F;
1951
                int g2= (d2>>5)&0x3F;
1952
                int r2= (d2>>11)&0x1F;
1953

    
1954
                int d3= src2[i*4+2] + (src2[i*4+3]<<8);
1955
                int b3= d3&0x1F;
1956
                int g3= (d3>>5)&0x3F;
1957
                int r3= (d3>>11)&0x1F;
1958

    
1959
                int b= b0 + b1 + b2 + b3;
1960
                int g= g0 + g1 + g2 + g3;
1961
                int r= r0 + r1 + r2 + r3;
1962
#endif
1963
                dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1964
                dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
1965
        }
1966
}
1967

    
1968
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
1969
{
1970
        int i;
1971
        for(i=0; i<width; i++)
1972
        {
1973
                int d= src[i*2] + (src[i*2+1]<<8);
1974
                int b= d&0x1F;
1975
                int g= (d>>5)&0x1F;
1976
                int r= (d>>10)&0x1F;
1977

    
1978
                dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
1979
        }
1980
}
1981

    
1982
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1983
{
1984
        int i;
1985
        for(i=0; i<width; i++)
1986
        {
1987
#if 1
1988
                int d0= le2me_32( ((uint32_t*)src1)[i] );
1989
                int d1= le2me_32( ((uint32_t*)src2)[i] );
1990
                
1991
                int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
1992
                int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);
1993

    
1994
                int dh2= (dh>>11) + (dh<<21);
1995
                int d= dh2 + dl;
1996

    
1997
                int b= d&0x7F;
1998
                int r= (d>>10)&0x7F;
1999
                int g= d>>21;
2000
#else
2001
                int d0= src1[i*4] + (src1[i*4+1]<<8);
2002
                int b0= d0&0x1F;
2003
                int g0= (d0>>5)&0x1F;
2004
                int r0= (d0>>10)&0x1F;
2005

    
2006
                int d1= src1[i*4+2] + (src1[i*4+3]<<8);
2007
                int b1= d1&0x1F;
2008
                int g1= (d1>>5)&0x1F;
2009
                int r1= (d1>>10)&0x1F;
2010

    
2011
                int d2= src2[i*4] + (src2[i*4+1]<<8);
2012
                int b2= d2&0x1F;
2013
                int g2= (d2>>5)&0x1F;
2014
                int r2= (d2>>10)&0x1F;
2015

    
2016
                int d3= src2[i*4+2] + (src2[i*4+3]<<8);
2017
                int b3= d3&0x1F;
2018
                int g3= (d3>>5)&0x1F;
2019
                int r3= (d3>>10)&0x1F;
2020

    
2021
                int b= b0 + b1 + b2 + b3;
2022
                int g= g0 + g1 + g2 + g3;
2023
                int r= r0 + r1 + r2 + r3;
2024
#endif
2025
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
2026
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
2027
        }
2028
}
2029

    
2030

    
2031
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2032
{
2033
        int i;
2034
        for(i=0; i<width; i++)
2035
        {
2036
                int r= src[i*4+0];
2037
                int g= src[i*4+1];
2038
                int b= src[i*4+2];
2039

    
2040
                dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2041
        }
2042
}
2043

    
2044
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2045
{
2046
        int i;
2047
        for(i=0; i<width; i++)
2048
        {
2049
                int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
2050
                int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
2051
                int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];
2052

    
2053
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2054
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2055
        }
2056
}
2057

    
2058
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2059
{
2060
        int i;
2061
        for(i=0; i<width; i++)
2062
        {
2063
                int r= src[i*3+0];
2064
                int g= src[i*3+1];
2065
                int b= src[i*3+2];
2066

    
2067
                dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2068
        }
2069
}
2070

    
2071
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2072
{
2073
        int i;
2074
        for(i=0; i<width; i++)
2075
        {
2076
                int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2077
                int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2078
                int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2079

    
2080
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2081
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2082
        }
2083
}
2084

    
2085

    
2086
// Bilinear / Bicubic scaling
2087
/**
 * Horizontal FIR scaler: filters one line of 8-bit samples into 16-bit
 * intermediate samples.
 *
 * The C fallback defines the result. For every output sample i:
 *   val    = sum over j < filterSize of src[filterPos[i] + j] * filter[filterSize*i + j]
 *   dst[i] = val>>7, clipped to the range 0 .. (1<<15)-1
 *
 * The HAVE_MMX build has unrolled loops for filterSize 4 and 8 and a generic
 * loop for every other size. All three run a negative byte counter up to
 * zero and produce two output samples per iteration.
 * NOTE(review): the MMX loops shift the sums right by 8 and then multiply by
 * the external constant w02; presumably that yields the same >>7 scaling as
 * the C code -- confirm. They saturate via packssdw instead of clipping at 0.
 * NOTE(review): the generic loop consumes 4 taps per inner iteration, so it
 * presumably relies on filterSize being a multiple of 4, and all MMX loops
 * presumably rely on padding when dstW is odd -- confirm against the caller.
 *
 * @param dst        output line, one 16-bit sample per output pixel
 * @param dstW       number of output samples
 * @param src        input line of 8-bit samples
 * @param srcW       not used by this function
 * @param xInc       not used by this function
 * @param filter     filter coefficients, filterSize entries per output sample
 * @param filterPos  index of the first source sample for each output sample
 * @param filterSize number of taps per output sample
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2088
                                  int16_t *filter, int16_t *filterPos, int filterSize)
2089
{
2090
#ifdef HAVE_MMX
2091
        if(filterSize==4) // always true for upscaling, sometimes for down too
2092
        {
2093
                // move the pointers to the end of their arrays and index with a negative counter
                int counter= -2*dstW;
2094
                filter-= counter*2;
2095
                filterPos-= counter/2;
2096
                dst-= counter/2;
2097
                asm volatile(
2098
                        "pxor %%mm7, %%mm7                \n\t"
2099
                        "movq "MANGLE(w02)", %%mm6        \n\t"
2100
                        "pushl %%ebp                        \n\t" // we use 7 regs here ...
2101
                        "movl %%eax, %%ebp                \n\t" // ebp= counter
2102
                        ".balign 16                        \n\t"
2103
                        "1:                                \n\t"
2104
                        "movzwl (%2, %%ebp), %%eax        \n\t" // filterPos of the 1st output sample
2105
                        "movzwl 2(%2, %%ebp), %%ebx        \n\t" // filterPos of the 2nd output sample
2106
                        "movq (%1, %%ebp, 4), %%mm1        \n\t"
2107
                        "movq 8(%1, %%ebp, 4), %%mm3        \n\t"
2108
                        "movd (%3, %%eax), %%mm0        \n\t"
2109
                        "movd (%3, %%ebx), %%mm2        \n\t"
2110
                        "punpcklbw %%mm7, %%mm0                \n\t"
2111
                        "punpcklbw %%mm7, %%mm2                \n\t"
2112
                        "pmaddwd %%mm1, %%mm0                \n\t"
2113
                        "pmaddwd %%mm2, %%mm3                \n\t"
2114
                        "psrad $8, %%mm0                \n\t"
2115
                        "psrad $8, %%mm3                \n\t"
2116
                        "packssdw %%mm3, %%mm0                \n\t"
2117
                        "pmaddwd %%mm6, %%mm0                \n\t"
2118
                        "packssdw %%mm0, %%mm0                \n\t"
2119
                        "movd %%mm0, (%4, %%ebp)        \n\t" // store 2 output samples
2120
                        "addl $4, %%ebp                        \n\t"
2121
                        " jnc 1b                        \n\t" // until the counter wraps to >= 0
2122

    
2123
                        "popl %%ebp                        \n\t"
2124
                        : "+a" (counter)
2125
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2126
                        : "%ebx"
2127
                );
2128
        }
2129
        else if(filterSize==8)
2130
        {
2131
                // same scheme as above with 8 coefficients (16 bytes) per output sample
                int counter= -2*dstW;
2132
                filter-= counter*4;
2133
                filterPos-= counter/2;
2134
                dst-= counter/2;
2135
                asm volatile(
2136
                        "pxor %%mm7, %%mm7                \n\t"
2137
                        "movq "MANGLE(w02)", %%mm6        \n\t"
2138
                        "pushl %%ebp                        \n\t" // we use 7 regs here ...
2139
                        "movl %%eax, %%ebp                \n\t"
2140
                        ".balign 16                        \n\t"
2141
                        "1:                                \n\t"
2142
                        "movzwl (%2, %%ebp), %%eax        \n\t"
2143
                        "movzwl 2(%2, %%ebp), %%ebx        \n\t"
2144
                        "movq (%1, %%ebp, 8), %%mm1        \n\t"
2145
                        "movq 16(%1, %%ebp, 8), %%mm3        \n\t"
2146
                        "movd (%3, %%eax), %%mm0        \n\t"
2147
                        "movd (%3, %%ebx), %%mm2        \n\t"
2148
                        "punpcklbw %%mm7, %%mm0                \n\t"
2149
                        "punpcklbw %%mm7, %%mm2                \n\t"
2150
                        "pmaddwd %%mm1, %%mm0                \n\t"
2151
                        "pmaddwd %%mm2, %%mm3                \n\t"
2152

    
2153
                        // taps 4..7 of both output samples
                        "movq 8(%1, %%ebp, 8), %%mm1        \n\t"
2154
                        "movq 24(%1, %%ebp, 8), %%mm5        \n\t"
2155
                        "movd 4(%3, %%eax), %%mm4        \n\t"
2156
                        "movd 4(%3, %%ebx), %%mm2        \n\t"
2157
                        "punpcklbw %%mm7, %%mm4                \n\t"
2158
                        "punpcklbw %%mm7, %%mm2                \n\t"
2159
                        "pmaddwd %%mm1, %%mm4                \n\t"
2160
                        "pmaddwd %%mm2, %%mm5                \n\t"
2161
                        "paddd %%mm4, %%mm0                \n\t"
2162
                        "paddd %%mm5, %%mm3                \n\t"
2163
                                                
2164
                        "psrad $8, %%mm0                \n\t"
2165
                        "psrad $8, %%mm3                \n\t"
2166
                        "packssdw %%mm3, %%mm0                \n\t"
2167
                        "pmaddwd %%mm6, %%mm0                \n\t"
2168
                        "packssdw %%mm0, %%mm0                \n\t"
2169
                        "movd %%mm0, (%4, %%ebp)        \n\t"
2170
                        "addl $4, %%ebp                        \n\t"
2171
                        " jnc 1b                        \n\t"
2172

    
2173
                        "popl %%ebp                        \n\t"
2174
                        : "+a" (counter)
2175
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2176
                        : "%ebx"
2177
                );
2178
        }
2179
        else
2180
        {
2181
                // generic size: filter is advanced inside the asm instead of being indexed
                int counter= -2*dstW;
2182
//                filter-= counter*filterSize/2;
2183
                filterPos-= counter/2;
2184
                dst-= counter/2;
2185
                asm volatile(
2186
                        "pxor %%mm7, %%mm7                \n\t"
2187
                        "movq "MANGLE(w02)", %%mm6        \n\t"
2188
                        ".balign 16                        \n\t"
2189
                        "1:                                \n\t"
2190
                        "movl %2, %%ecx                        \n\t"
2191
                        "movzwl (%%ecx, %0), %%eax        \n\t"
2192
                        "movzwl 2(%%ecx, %0), %%ebx        \n\t"
2193
                        "movl %5, %%ecx                        \n\t" // ecx= src
2194
                        "pxor %%mm4, %%mm4                \n\t" // accumulator of the 1st output sample
2195
                        "pxor %%mm5, %%mm5                \n\t" // accumulator of the 2nd output sample
2196
                        "2:                                \n\t"
2197
                        "movq (%1), %%mm1                \n\t"
2198
                        "movq (%1, %6), %%mm3                \n\t" // coefficients of the 2nd sample, filterSize*2 bytes further
2199
                        "movd (%%ecx, %%eax), %%mm0        \n\t"
2200
                        "movd (%%ecx, %%ebx), %%mm2        \n\t"
2201
                        "punpcklbw %%mm7, %%mm0                \n\t"
2202
                        "punpcklbw %%mm7, %%mm2                \n\t"
2203
                        "pmaddwd %%mm1, %%mm0                \n\t"
2204
                        "pmaddwd %%mm2, %%mm3                \n\t"
2205
                        "paddd %%mm3, %%mm5                \n\t"
2206
                        "paddd %%mm0, %%mm4                \n\t"
2207
                        "addl $8, %1                        \n\t"
2208
                        "addl $4, %%ecx                        \n\t"
2209
                        "cmpl %4, %%ecx                        \n\t" // 4 taps per pass until src+filterSize is reached
2210
                        " jb 2b                                \n\t"
2211
                        "addl %6, %1                        \n\t" // skip the coefficients of the 2nd sample
2212
                        "psrad $8, %%mm4                \n\t"
2213
                        "psrad $8, %%mm5                \n\t"
2214
                        "packssdw %%mm5, %%mm4                \n\t"
2215
                        "pmaddwd %%mm6, %%mm4                \n\t"
2216
                        "packssdw %%mm4, %%mm4                \n\t"
2217
                        "movl %3, %%eax                        \n\t"
2218
                        "movd %%mm4, (%%eax, %0)        \n\t"
2219
                        "addl $4, %0                        \n\t"
2220
                        " jnc 1b                        \n\t"
2221

    
2222
                        : "+r" (counter), "+r" (filter)
2223
                        : "m" (filterPos), "m" (dst), "m"(src+filterSize),
2224
                          "m" (src), "r" (filterSize*2)
2225
                        : "%ebx", "%eax", "%ecx"
2226
                );
2227
        }
2228
#else
2229
        int i;
2230
        for(i=0; i<dstW; i++)
2231
        {
2232
                int j;
2233
                int srcPos= filterPos[i];
2234
                int val=0;
2235
//                printf("filterPos: %d\n", filterPos[i]);
2236
                for(j=0; j<filterSize; j++)
2237
                {
2238
//                        printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2239
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2240
                }
2241
//                filter += hFilterSize;
2242
                dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2243
//                dst[i] = val>>7;
2244
        }
2245
#endif
2246
}
2247
      // *** horizontal scale Y line to temp buffer
2248
/**
 * Horizontally scales one luma line into the 16-bit intermediate buffer.
 *
 * For the packed source formats tested below (YUY2, BGR32/24/16/15,
 * RGB32/24) the line is first converted to 8-bit luma in formatConvBuffer by
 * the matching *ToY helper and src is redirected to that buffer.
 *
 * The scaling is then done by RENAME(hScale) with the hLumFilter tables
 * unless SWS_FAST_BILINEAR is set in flags; in HAVE_MMX builds hScale is also
 * used whenever canMMX2BeUsed is false. Otherwise one of the fast bilinear
 * paths runs:
 *  - ARCH_X86 with HAVE_MMX2 and canMMX2BeUsed: the code that funnyYCode
 *    points to is called eight times, then the trailing output samples whose
 *    source position reaches srcW-1 are overwritten with src[srcW-1]*128.
 *  - ARCH_X86 otherwise: an integer asm loop that steps through the source in
 *    16.16 fixed point and writes two output samples per iteration.
 *  - other architectures: the C loop at the end, which computes
 *    dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha with a 7-bit xalpha.
 * NOTE(review): the bilinear paths read src[xx+1], so the source line
 * presumably needs one readable sample past the last one -- confirm.
 *
 * @param dst              output line of 16-bit samples
 * @param dstWidth         number of output samples
 * @param src              source line in the format given by srcFormat
 * @param srcW             number of source samples
 * @param xInc             horizontal step in 16.16 fixed point
 * @param flags            scaler flags; SWS_FAST_BILINEAR is tested here
 * @param canMMX2BeUsed    nonzero if the funnyYCode path may be taken
 * @param hLumFilter       coefficients passed to hScale
 * @param hLumFilterPos    source positions passed to hScale
 * @param hLumFilterSize   taps per output sample passed to hScale
 * @param funnyYCode       code called by the MMX2 path
 * @param srcFormat        IMGFMT_* value of the source
 * @param formatConvBuffer scratch buffer for the luma conversion
 * @param mmx2Filter       table read by the MMX2 path
 * @param mmx2FilterPos    table read by the MMX2 path
 */
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
2249
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2250
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
2251
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2252
                                   int32_t *mmx2FilterPos)
2253
{
2254
    // convert packed input formats to plain 8-bit luma first
    if(srcFormat==IMGFMT_YUY2)
2255
    {
2256
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2257
        src= formatConvBuffer;
2258
    }
2259
    else if(srcFormat==IMGFMT_BGR32)
2260
    {
2261
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2262
        src= formatConvBuffer;
2263
    }
2264
    else if(srcFormat==IMGFMT_BGR24)
2265
    {
2266
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2267
        src= formatConvBuffer;
2268
    }
2269
    else if(srcFormat==IMGFMT_BGR16)
2270
    {
2271
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2272
        src= formatConvBuffer;
2273
    }
2274
    else if(srcFormat==IMGFMT_BGR15)
2275
    {
2276
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2277
        src= formatConvBuffer;
2278
    }
2279
    else if(srcFormat==IMGFMT_RGB32)
2280
    {
2281
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2282
        src= formatConvBuffer;
2283
    }
2284
    else if(srcFormat==IMGFMT_RGB24)
2285
    {
2286
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2287
        src= formatConvBuffer;
2288
    }
2289

    
2290
#ifdef HAVE_MMX
2291
        // use the new MMX scaler if the MMX2 one can't be used (it's faster than the x86asm one)
2292
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2293
#else
2294
    if(!(flags&SWS_FAST_BILINEAR))
2295
#endif
2296
    {
2297
            RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2298
    }
2299
    else // Fast Bilinear upscale / crap downscale
2300
    {
2301
#ifdef ARCH_X86
2302
#ifdef HAVE_MMX2
2303
        int i;
2304
        if(canMMX2BeUsed)
2305
        {
2306
                asm volatile(
2307
                        "pxor %%mm7, %%mm7                \n\t"
2308
                        "movl %0, %%ecx                        \n\t"
2309
                        "movl %1, %%edi                        \n\t"
2310
                        "movl %2, %%edx                        \n\t"
2311
                        "movl %3, %%ebx                        \n\t"
2312
                        "xorl %%eax, %%eax                \n\t" // i
2313
                        PREFETCH" (%%ecx)                \n\t"
2314
                        PREFETCH" 32(%%ecx)                \n\t"
2315
                        PREFETCH" 64(%%ecx)                \n\t"
2316

    
2317
#define FUNNY_Y_CODE \
2318
                        "movl (%%ebx), %%esi                \n\t"\
2319
                        "call *%4                        \n\t"\
2320
                        "addl (%%ebx, %%eax), %%ecx        \n\t"\
2321
                        "addl %%eax, %%edi                \n\t"\
2322
                        "xorl %%eax, %%eax                \n\t"\
2323

    
2324
FUNNY_Y_CODE
2325
FUNNY_Y_CODE
2326
FUNNY_Y_CODE
2327
FUNNY_Y_CODE
2328
FUNNY_Y_CODE
2329
FUNNY_Y_CODE
2330
FUNNY_Y_CODE
2331
FUNNY_Y_CODE
2332

    
2333
                        :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2334
                        "m" (funnyYCode)
2335
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2336
                );
2337
                // redo the rightmost samples whose source position reaches the last source sample
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2338
        }
2339
        else
2340
        {
2341
#endif
2342
        //NO MMX just normal asm ...
2343
        asm volatile(
2344
                "xorl %%eax, %%eax                \n\t" // i
2345
                "xorl %%ebx, %%ebx                \n\t" // xx
2346
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
2347
                ".balign 16                        \n\t"
2348
                "1:                                \n\t"
2349
                "movzbl  (%0, %%ebx), %%edi        \n\t" //src[xx]
2350
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
2351
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2352
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2353
                "shll $16, %%edi                \n\t"
2354
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2355
                "movl %1, %%edi                        \n\t"
2356
                "shrl $9, %%esi                        \n\t"
2357
                "movw %%si, (%%edi, %%eax, 2)        \n\t"
2358
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
2359
                "adcl %3, %%ebx                        \n\t" //xx+= (xInc>>16) + carry
2360

    
2361
                "movzbl (%0, %%ebx), %%edi        \n\t" //src[xx]
2362
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
2363
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2364
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2365
                "shll $16, %%edi                \n\t"
2366
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2367
                "movl %1, %%edi                        \n\t"
2368
                "shrl $9, %%esi                        \n\t"
2369
                "movw %%si, 2(%%edi, %%eax, 2)        \n\t"
2370
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
2371
                "adcl %3, %%ebx                        \n\t" //xx+= (xInc>>16) + carry
2372

    
2373

    
2374
                "addl $2, %%eax                        \n\t"
2375
                "cmpl %2, %%eax                        \n\t"
2376
                " jb 1b                                \n\t"
2377

    
2378

    
2379
                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
2380
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2381
                );
2382
#ifdef HAVE_MMX2
2383
        } //if MMX2 cant be used
2384
#endif
2385
#else
2386
        int i;
2387
        unsigned int xpos=0;
2388
        for(i=0;i<dstWidth;i++)
2389
        {
2390
                // integer part of the 16.16 position and the top 7 bits of its fraction
                register unsigned int xx=xpos>>16;
2391
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
2392
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2393
                xpos+=xInc;
2394
        }
2395
#endif
2396
    }
2397
}
2398

    
2399
inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
2400
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2401
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2402
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2403
                                   int32_t *mmx2FilterPos)
2404
{
2405
    if(srcFormat==IMGFMT_YUY2)
2406
    {
2407
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2408
        src1= formatConvBuffer;
2409
        src2= formatConvBuffer+2048;
2410
    }
2411
    else if(srcFormat==IMGFMT_BGR32)
2412
    {
2413
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2414
        src1= formatConvBuffer;
2415
        src2= formatConvBuffer+2048;
2416
    }
2417
    else if(srcFormat==IMGFMT_BGR24)
2418
    {
2419
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2420
        src1= formatConvBuffer;
2421
        src2= formatConvBuffer+2048;
2422
    }
2423
    else if(srcFormat==IMGFMT_BGR16)
2424
    {
2425
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2426
        src1= formatConvBuffer;
2427
        src2= formatConvBuffer+2048;
2428
    }
2429
    else if(srcFormat==IMGFMT_BGR15)
2430
    {
2431
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2432
        src1= formatConvBuffer;
2433
        src2= formatConvBuffer+2048;
2434
    }
2435
    else if(srcFormat==IMGFMT_RGB32)
2436
    {
2437
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2438
        src1= formatConvBuffer;
2439
        src2= formatConvBuffer+2048;
2440
    }
2441
    else if(srcFormat==IMGFMT_RGB24)
2442
    {
2443
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2444
        src1= formatConvBuffer;
2445
        src2= formatConvBuffer+2048;
2446
    }
2447
    else if(isGray(srcFormat))
2448
    {
2449
            return;
2450
    }
2451

    
2452
#ifdef HAVE_MMX
2453
        // use the new MMX scaler if th mmx2 cant be used (its faster than the x86asm one)
2454
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2455
#else
2456
    if(!(flags&SWS_FAST_BILINEAR))
2457
#endif
2458
    {
2459
            RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2460
            RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2461
    }
2462
    else // Fast Bilinear upscale / crap downscale
2463
    {
2464
#ifdef ARCH_X86
2465
#ifdef HAVE_MMX2
2466
        int i;
2467
        if(canMMX2BeUsed)
2468
        {
2469
                asm volatile(
2470
                        "pxor %%mm7, %%mm7                \n\t"
2471
                        "movl %0, %%ecx                        \n\t"
2472
                        "movl %1, %%edi                        \n\t"
2473
                        "movl %2, %%edx                        \n\t"
2474
                        "movl %3, %%ebx                        \n\t"
2475
                        "xorl %%eax, %%eax                \n\t" // i
2476
                        PREFETCH" (%%ecx)                \n\t"
2477
                        PREFETCH" 32(%%ecx)                \n\t"
2478
                        PREFETCH" 64(%%ecx)                \n\t"
2479

    
2480
#define FUNNY_UV_CODE \
2481
                        "movl (%%ebx), %%esi                \n\t"\
2482
                        "call *%4                        \n\t"\
2483
                        "addl (%%ebx, %%eax), %%ecx        \n\t"\
2484
                        "addl %%eax, %%edi                \n\t"\
2485
                        "xorl %%eax, %%eax                \n\t"\
2486

    
2487
FUNNY_UV_CODE
2488
FUNNY_UV_CODE
2489
FUNNY_UV_CODE
2490
FUNNY_UV_CODE
2491
                        "xorl %%eax, %%eax                \n\t" // i
2492
                        "movl %5, %%ecx                        \n\t" // src
2493
                        "movl %1, %%edi                        \n\t" // buf1
2494
                        "addl $4096, %%edi                \n\t"
2495
                        PREFETCH" (%%ecx)                \n\t"
2496
                        PREFETCH" 32(%%ecx)                \n\t"
2497
                        PREFETCH" 64(%%ecx)                \n\t"
2498

    
2499
FUNNY_UV_CODE
2500
FUNNY_UV_CODE
2501
FUNNY_UV_CODE
2502
FUNNY_UV_CODE
2503

    
2504
                        :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2505
                        "m" (funnyUVCode), "m" (src2)
2506
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
2507
                );
2508
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2509
                {
2510
//                        printf("%d %d %d\n", dstWidth, i, srcW);
2511
                        dst[i] = src1[srcW-1]*128;
2512
                        dst[i+2048] = src2[srcW-1]*128;
2513
                }
2514
        }
2515
        else
2516
        {
2517
#endif
2518
        asm volatile(
2519
                "xorl %%eax, %%eax                \n\t" // i
2520
                "xorl %%ebx, %%ebx                \n\t" // xx
2521
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
2522
                ".balign 16                        \n\t"
2523
                "1:                                \n\t"
2524
                "movl %0, %%esi                        \n\t"
2525
                "movzbl  (%%esi, %%ebx), %%edi        \n\t" //src[xx]
2526
                "movzbl 1(%%esi, %%ebx), %%esi        \n\t" //src[xx+1]
2527
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2528
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2529
                "shll $16, %%edi                \n\t"
2530
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2531
                "movl %1, %%edi                        \n\t"
2532
                "shrl $9, %%esi                        \n\t"
2533
                "movw %%si, (%%edi, %%eax, 2)        \n\t"
2534

    
2535
                "movzbl  (%5, %%ebx), %%edi        \n\t" //src[xx]
2536
                "movzbl 1(%5, %%ebx), %%esi        \n\t" //src[xx+1]
2537
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2538
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2539
                "shll $16, %%edi                \n\t"
2540
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2541
                "movl %1, %%edi                        \n\t"
2542
                "shrl $9, %%esi                        \n\t"
2543
                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"
2544

    
2545
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
2546
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>8 + carry
2547
                "addl $1, %%eax                        \n\t"
2548
                "cmpl %2, %%eax                        \n\t"
2549
                " jb 1b                                \n\t"
2550

    
2551
                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
2552
                "r" (src2)
2553
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
2554
                );
2555
#ifdef HAVE_MMX2
2556
        } //if MMX2 cant be used
2557
#endif
2558
#else
2559
        int i;
2560
        unsigned int xpos=0;
2561
        for(i=0;i<dstWidth;i++)
2562
        {
2563
                register unsigned int xx=xpos>>16;
2564
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
2565
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2566
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2567
/* slower
2568
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2569
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2570
*/
2571
                xpos+=xInc;
2572
        }
2573
#endif
2574
   }
2575
}
2576

    
2577
/**
 * Scales one slice of the source picture into the destination picture.
 *
 * For every destination line that can be completed with the source lines
 * supplied so far, the needed source lines are scaled horizontally into the
 * luma/chroma ring buffers (lumPixBuf / chrPixBuf) and then combined
 * vertically and written out by one of the yuv2yuv* / yuv2rgb* functions.
 * If the current slice does not contain all lines needed for the next
 * destination line, the remaining lines of the slice are buffered and the
 * function returns; progress is kept in the context for the next call.
 *
 * @param c               scaler context; dstY, lumBufIndex, chrBufIndex,
 *                        lastInLumBuf and lastInChrBuf are updated on return
 * @param srcParam        source plane pointers for this slice
 * @param srcStrideParam  source strides, one per plane
 * @param srcSliceY       position of the first line of this slice in the source
 * @param srcSliceH       number of lines in this slice
 * @param dstParam        destination plane pointers
 * @param dstStrideParam  destination strides, one per plane
 */
static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){

        /* load a few things into local vars to make the code more readable? and faster */
        const int srcW= c->srcW;
        const int dstW= c->dstW;
        const int dstH= c->dstH;
        const int chrDstW= c->chrDstW;
        const int chrSrcW= c->chrSrcW;
        const int lumXInc= c->lumXInc;
        const int chrXInc= c->chrXInc;
        const int dstFormat= c->dstFormat;
        const int srcFormat= c->srcFormat;
        const int flags= c->flags;
        const int canMMX2BeUsed= c->canMMX2BeUsed;
        int16_t *vLumFilterPos= c->vLumFilterPos;
        int16_t *vChrFilterPos= c->vChrFilterPos;
        int16_t *hLumFilterPos= c->hLumFilterPos;
        int16_t *hChrFilterPos= c->hChrFilterPos;
        int16_t *vLumFilter= c->vLumFilter;
        int16_t *vChrFilter= c->vChrFilter;
        int16_t *hLumFilter= c->hLumFilter;
        int16_t *hChrFilter= c->hChrFilter;
        int16_t *lumMmxFilter= c->lumMmxFilter;
        int16_t *chrMmxFilter= c->chrMmxFilter;
        const int vLumFilterSize= c->vLumFilterSize;
        const int vChrFilterSize= c->vChrFilterSize;
        const int hLumFilterSize= c->hLumFilterSize;
        const int hChrFilterSize= c->hChrFilterSize;
        int16_t **lumPixBuf= c->lumPixBuf;
        int16_t **chrPixBuf= c->chrPixBuf;
        const int vLumBufSize= c->vLumBufSize;
        const int vChrBufSize= c->vChrBufSize;
        uint8_t *funnyYCode= c->funnyYCode;
        uint8_t *funnyUVCode= c->funnyUVCode;
        uint8_t *formatConvBuffer= c->formatConvBuffer;
        /* slice position / height in chroma lines (the height is rounded up) */
        const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
        const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
        /* a chroma line is written only on luma lines where (dstY & chrSkipMask)==0,
           i.e. once per destination chroma line (chrDstY= dstY>>chrDstVSubSample) */
        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;

        /* vars which will change and which we need to store back in the context */
        int dstY= c->dstY;
        int lumBufIndex= c->lumBufIndex;
        int chrBufIndex= c->chrBufIndex;
        int lastInLumBuf= c->lastInLumBuf;
        int lastInChrBuf= c->lastInChrBuf;
        int srcStride[3];
        int dstStride[3];
        uint8_t *src[3];
        uint8_t *dst[3];

        /* Map the caller's planes to the internal plane order.
           NOTE(review): src[]/srcStride[] stay uninitialized if srcFormat matches
           none of the cases below; presumably such formats are rejected before
           this function is reached -- confirm against the context setup code. */
        if(c->srcFormat == IMGFMT_I420){
                /* I420: planes 1 and 2 are swapped relative to YV12 */
                src[0]= srcParam[0];
                src[1]= srcParam[2];
                src[2]= srcParam[1];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[2];
                srcStride[2]= srcStrideParam[1];
        }
        else if(c->srcFormat==IMGFMT_YV12 || c->srcFormat==IMGFMT_YVU9){
                src[0]= srcParam[0];
                src[1]= srcParam[1];
                src[2]= srcParam[2];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[1];
                srcStride[2]= srcStrideParam[2];
        }
        else if(isPacked(c->srcFormat)){
                /* packed input: all "planes" alias the single packed plane,
                   the chroma stride is twice the luma stride */
                src[0]=
                src[1]=
                src[2]= srcParam[0];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]=
                srcStride[2]= srcStrideParam[0]<<1;
        }
        else if(isGray(c->srcFormat)){
                /* gray input: no chroma planes */
                src[0]= srcParam[0];
                src[1]=
                src[2]= NULL;
                srcStride[0]= srcStrideParam[0];
                srcStride[1]=
                srcStride[2]= 0;
        }

        if(dstFormat == IMGFMT_I420){
                dst[0]= dstParam[0];
                dst[1]= dstParam[2];
                dst[2]= dstParam[1];
                dstStride[0]= dstStrideParam[0];
                dstStride[1]= dstStrideParam[2];
                dstStride[2]= dstStrideParam[1];
        }else{
                dst[0]= dstParam[0];
                dst[1]= dstParam[1];
                dst[2]= dstParam[2];
                dstStride[0]= dstStrideParam[0];
                dstStride[1]= dstStrideParam[1];
                dstStride[2]= dstStrideParam[2];
        }

#if 0 //self test FIXME move to a vfilter or something
{
static volatile int i=0;
i++;
if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
i--;
}
#endif

        /* warn (once) about destination strides which are not a multiple of 8 */
        if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
        {
                static int firstTime=1; //FIXME move this into the context perhaps
                if(flags & SWS_PRINT_INFO && firstTime)
                {
                        mp_msg(MSGT_SWS,MSGL_WARN,"SwScaler: Warning: dstStride is not aligned!\n"
                                        "SwScaler:          ->cannot do aligned memory acesses anymore\n");
                        firstTime=0;
                }
        }

        /* Note the user might start scaling the picture in the middle so this will not get executed
           this is not really intended but works currently, so ppl might do it */
        if(srcSliceY ==0){
                lumBufIndex=0;
                chrBufIndex=0;
                dstY=0;
                lastInLumBuf= -1;
                lastInChrBuf= -1;
        }

        for(;dstY < dstH; dstY++){
                unsigned char *dest =dst[0]+dstStride[0]*dstY;
                const int chrDstY= dstY>>c->chrDstVSubSample;
                unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
                unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

                const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
                const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
                const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
                const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

                //handle holes (FAST_BILINEAR & weird filters)
                if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
                if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
                ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

                // Do we have enough lines in this slice to output the dstY line
                if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
                {
                        //Do horizontal scaling
                        while(lastInLumBuf < lastLumSrcY)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer,
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf < lastChrSrcY)
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
                                //FIXME replace parameters through context struct (some at least)

                                // chroma is not scaled if either side is gray
                                if(!(isGray(srcFormat) || isGray(dstFormat)))
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer,
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                }
                else // not enough lines left in this slice -> load the rest in the buffer
                {
                        //Do horizontal scaling
                        while(lastInLumBuf+1 < srcSliceY + srcSliceH)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer,
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)

                                // chroma is not scaled if either side is gray
                                if(!(isGray(srcFormat) || isGray(dstFormat)))
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer,
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                        break; //we cant output a dstY line so lets try with the next slice
                }

#ifdef HAVE_MMX
                /* select the dither patterns for this line (they alternate per line) */
                b5Dither= dither8[dstY&1];
                g6Dither= dither4[dstY&1];
                g5Dither= dither8[dstY&1];
                r5Dither= dither8[(dstY+1)&1];
#endif
            if(dstY < dstH-2)
            {
                if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
                {
                        // no chroma output on skipped lines or for gray destinations
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                        {
                                int16_t *lumBuf = lumPixBuf[0];
                                int16_t *chrBuf= chrPixBuf[0];
                                RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                        }
                        else //General YV12
                        {
                                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                                RENAME(yuv2yuvX)(
                                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, uDest, vDest, dstW, chrDstW,
                                        lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+chrDstY*vChrFilterSize*4);
                        }
                }
                else
                {
                        int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                        int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;

                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                        {
                                int chrAlpha= vChrFilter[2*dstY+1];

                                RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, chrAlpha, dstFormat, flags);
                        }
                        else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                        {
                                int lumAlpha= vLumFilter[2*dstY+1];
                                int chrAlpha= vChrFilter[2*dstY+1];

                                RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, lumAlpha, chrAlpha, dstFormat, flags);
                        }
                        else //General RGB
                        {
                                RENAME(yuv2rgbX)(
                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, dstW, dstFormat,
                                        lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
                        }
                }
            }
            else // hmm looks like we cant use MMX here without overwriting this arrays tail
            {
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                /* same format dispatch as the branch above: gray destinations are
                   handled by the planar code with the chroma output disabled */
                if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
                {
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL;
                        yuv2yuvXinC(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, uDest, vDest);
                }
                else
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        yuv2rgbXinC(
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, dstW, dstFormat);
                }
            }
        }

#ifdef HAVE_MMX
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* store changed local vars back in the context */
        c->dstY= dstY;
        c->lumBufIndex= lumBufIndex;
        c->chrBufIndex= chrBufIndex;
        c->lastInLumBuf= lastInLumBuf;
        c->lastInChrBuf= lastInChrBuf;
}