ffmpeg / postproc / swscale_template.c @ df3c183a

/*
    Copyright (C) 2001-2002 Michael Niedermayer <michaelni@gmx.at>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

#ifdef HAVE_MMX2
#define MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif


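/*
 * YSCALEYUV2YV12X(x): vertical scaling loop for one output plane. For every 8
 * output pixels it accumulates pmulhw(srcLine[j], coeff[j]) over all filter
 * taps (%%edx counts up from -filterSize to 0), shifts the sums right by 3 and
 * packs them to unsigned bytes, stored with MOVNTQ. "x" is a byte offset into
 * the source lines (0 for luma/U, 4096 to reach the V samples stored 2048
 * int16_t after U). Each coefficient is stored 4 times in the MmxFilter
 * arrays, so one movq covers 4 pixels. Per pixel this is roughly the scalar
 *
 *     int val= 0;
 *     for(j=0; j<filterSize; j++)
 *         val += (src[j][i]*filter[j])>>16;   // pmulhw
 *     dst[i]= MIN(MAX(val>>3, 0), 255);       // psraw $3 + packuswb
 *
 * i.e. a total right shift of 19.
 */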
#define YSCALEYUV2YV12X(x) \
                        "xorl %%eax, %%eax                \n\t"\
                        "pxor %%mm3, %%mm3                \n\t"\
                        "pxor %%mm4, %%mm4                \n\t"\
                        "movl %0, %%edx                        \n\t"\
                        ".balign 16                        \n\t" /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movl (%1, %%edx, 4), %%esi        \n\t"\
                        "movq (%2, %%edx, 8), %%mm0        \n\t" /* filterCoeff */\
                        "movq " #x "(%%esi, %%eax, 2), %%mm2        \n\t" /* srcData */\
                        "movq 8+" #x "(%%esi, %%eax, 2), %%mm5        \n\t" /* srcData */\
                        "pmulhw %%mm0, %%mm2                \n\t"\
                        "pmulhw %%mm0, %%mm5                \n\t"\
                        "paddw %%mm2, %%mm3                \n\t"\
                        "paddw %%mm5, %%mm4                \n\t"\
                        "addl $1, %%edx                        \n\t"\
                        " jnz 1b                        \n\t"\
                        "psraw $3, %%mm3                \n\t"\
                        "psraw $3, %%mm4                \n\t"\
                        "packuswb %%mm4, %%mm3                \n\t"\
                        MOVNTQ(%%mm3, (%3, %%eax))\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %4, %%eax                        \n\t"\
                        "pxor %%mm3, %%mm3                \n\t"\
                        "pxor %%mm4, %%mm4                \n\t"\
                        "movl %0, %%edx                        \n\t"\
                        "jb 1b                                \n\t"

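/*
 * YSCALEYUV2YV121: unscaled (1:1) vertical pass. It only converts the 16 bit
 * intermediate samples back to 8 bit: arithmetic shift right by 7, then pack
 * with unsigned saturation, 8 pixels per MOVNTQ store.
 */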
#define YSCALEYUV2YV121 \
                        "movl %2, %%eax                        \n\t"\
                        ".balign 16                        \n\t" /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movq (%0, %%eax, 2), %%mm0        \n\t"\
                        "movq 8(%0, %%eax, 2), %%mm1        \n\t"\
                        "psraw $7, %%mm0                \n\t"\
                        "psraw $7, %%mm1                \n\t"\
                        "packuswb %%mm1, %%mm0                \n\t"\
                        MOVNTQ(%%mm0, (%1, %%eax))\
                        "addl $8, %%eax                        \n\t"\
                        "jnc 1b                                \n\t"

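/*
 * YSCALEYUV2RGBX: vertical scaling (arbitrary filter) of luma and chroma
 * followed by YUV->RGB conversion. It ends with 8 packed blue bytes in %%mm2,
 * green in %%mm4, red in %%mm5 and zero in %%mm7, ready for one of the
 * WRITEBGR* macros below. The intended operand layout is documented in the
 * comment that follows.
 */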
/*
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
#define YSCALEYUV2RGBX \
                "xorl %%eax, %%eax                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movl %1, %%edx                        \n\t" /* -chrFilterSize */\
                "movl %3, %%ebx                        \n\t" /* chrMmxFilter+lumFilterSize */\
                "movl %7, %%ecx                        \n\t" /* chrSrc+lumFilterSize */\
                "pxor %%mm3, %%mm3                \n\t"\
                "pxor %%mm4, %%mm4                \n\t"\
                "2:                                \n\t"\
                "movl (%%ecx, %%edx, 4), %%esi        \n\t"\
                "movq (%%ebx, %%edx, 8), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%esi, %%eax), %%mm2        \n\t" /* UsrcData */\
                "movq 4096(%%esi, %%eax), %%mm5        \n\t" /* VsrcData */\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm3                \n\t"\
                "paddw %%mm5, %%mm4                \n\t"\
                "addl $1, %%edx                        \n\t"\
                " jnz 2b                        \n\t"\
\
                "movl %0, %%edx                        \n\t" /* -lumFilterSize */\
                "movl %2, %%ebx                        \n\t" /* lumMmxFilter+lumFilterSize */\
                "movl %6, %%ecx                        \n\t" /* lumSrc+lumFilterSize */\
                "pxor %%mm1, %%mm1                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"\
                "2:                                \n\t"\
                "movl (%%ecx, %%edx, 4), %%esi        \n\t"\
                "movq (%%ebx, %%edx, 8), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%esi, %%eax, 2), %%mm2        \n\t" /* Y1srcData */\
                "movq 8(%%esi, %%eax, 2), %%mm5        \n\t" /* Y2srcData */\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm1                \n\t"\
                "paddw %%mm5, %%mm7                \n\t"\
                "addl $1, %%edx                        \n\t"\
                " jnz 2b                        \n\t"\
\
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* (U-128)8*/\
                "psubw "MANGLE(w400)", %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
                "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "MANGLE(w80)", %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
                "pmulhw "MANGLE(yCoeff)", %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

186
                "pxor %%mm7, %%mm7                \n\t"\
187
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
188
                "punpcklwd %%mm6, %%mm6                \n\t"\
189
                "punpcklwd %%mm6, %%mm6                \n\t"\
190
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
191
                "punpcklwd %%mm5, %%mm5                \n\t"\
192
                "punpcklwd %%mm5, %%mm5                \n\t"\
193
                "xorl %%eax, %%eax                \n\t"\
194
                ".balign 16                        \n\t"\
195
                "1:                                \n\t"\
196
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
197
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
198
                "movq (%2, %%eax,2), %%mm2        \n\t" /* uvbuf0[eax]*/\
199
                "movq (%3, %%eax,2), %%mm3        \n\t" /* uvbuf1[eax]*/\
200
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
201
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
202
                "pmulhw %%mm6, %%mm0                \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
203
                "pmulhw %%mm5, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
204
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
205
                "movq 4096(%2, %%eax,2), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
206
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
207
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
208
                "movq 4096(%3, %%eax,2), %%mm0        \n\t" /* uvbuf1[eax+2048]*/\
209
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
210
                "psubw %%mm0, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
211
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
212
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* 8(U-128)*/\
213
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
214
\
215
\
216
                "pmulhw %%mm5, %%mm4                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
217
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
218
                "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
219
                "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
220
                "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
221
                "paddw %%mm4, %%mm0                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
222
                "psubw "MANGLE(w400)", %%mm0        \n\t" /* (V-128)8*/\
223
\
224
\
225
                "movq %%mm0, %%mm4                \n\t" /* (V-128)8*/\
226
                "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
227
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
228
                "paddw %%mm1, %%mm3                \n\t" /* B*/\
229
                "paddw %%mm1, %%mm0                \n\t" /* R*/\
230
                "packuswb %%mm3, %%mm3                \n\t"\
231
\
232
                "packuswb %%mm0, %%mm0                \n\t"\
233
                "paddw %%mm4, %%mm2                \n\t"\
234
                "paddw %%mm2, %%mm1                \n\t" /* G*/\
235
\
236
                "packuswb %%mm1, %%mm1                \n\t"
237

    
238
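/*
 * YSCALEYUV2RGB: bilinear vertical blend of two luma lines and two (2:1
 * horizontally subsampled) chroma lines, then YUV->RGB. Like YSCALEYUV2RGBX it
 * ends with B in %%mm2, G in %%mm4, R in %%mm5 and 0 in %%mm7 for WRITEBGR*.
 */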
#define YSCALEYUV2RGB \
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movq %%mm6, "MANGLE(asm_yalpha1)"\n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "movq %%mm5, "MANGLE(asm_uvalpha1)"\n\t"\
                "xorl %%eax, %%eax                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, %%eax), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "MANGLE(asm_uvalpha1)", %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* (U-128)8*/\
                "psubw "MANGLE(w400)", %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%eax, 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm6        \n\t" /*buf0[eax]*/\
                "movq 8(%1, %%eax, 2), %%mm7        \n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "MANGLE(asm_yalpha1)", %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "MANGLE(asm_yalpha1)", %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
                "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "MANGLE(w80)", %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
                "pmulhw "MANGLE(yCoeff)", %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

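/*
 * YSCALEYUV2RGB1: same output layout as above, but reads a single luma and a
 * single chroma line (no vertical blending).
 */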
#define YSCALEYUV2RGB1 \
                "xorl %%eax, %%eax                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, %%eax), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* (U-128)8*/\
                "psubw "MANGLE(w400)", %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
                "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "MANGLE(w80)", %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
                "pmulhw "MANGLE(yCoeff)", %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

// do vertical chrominance interpolation (chroma is averaged from the two input lines, luma still comes from buf0 only)
#define YSCALEYUV2RGB1b \
                "xorl %%eax, %%eax                \n\t"\
                ".balign 16                        \n\t"\
                "1:                                \n\t"\
                "movq (%2, %%eax), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, %%eax), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
                "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* (U-128)8*/\
                "psubw "MANGLE(w400)", %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "MANGLE(ugCoeff)", %%mm3\n\t"\
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, %%eax, 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, %%eax, 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "MANGLE(ubCoeff)", %%mm2\n\t"\
                "pmulhw "MANGLE(vrCoeff)", %%mm5\n\t"\
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "MANGLE(w80)", %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
                "pmulhw "MANGLE(yCoeff)", %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

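/*
 * WRITEBGR*: output stages. They expect 8 packed blue bytes in %%mm2, green in
 * %%mm4, red in %%mm5 and zero in %%mm7 (as left by the YSCALEYUV2RGB* macros),
 * interleave them into the requested BGR layout and store the result with
 * MOVNTQ; the 15/16 bit variants first mask the channels down to 5/6 bits.
 */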
#define WRITEBGR32 \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        MOVNTQ(%%mm0, (%4, %%eax, 4))\
                        MOVNTQ(%%mm2, 8(%4, %%eax, 4))\
                        MOVNTQ(%%mm1, 16(%4, %%eax, 4))\
                        MOVNTQ(%%mm3, 24(%4, %%eax, 4))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR16 \
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
                        "pand "MANGLE(bFC)", %%mm4        \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $3, %%mm3                \n\t"\
                        "psllq $3, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR15 \
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
                        "pand "MANGLE(bF8)", %%mm4        \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
                        "psrlq $1, %%mm5                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $2, %%mm3                \n\t"\
                        "psllq $2, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (%4, %%eax, 2))\
                        MOVNTQ(%%mm1, 8(%4, %%eax, 2))\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR24OLD \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
                        "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4                \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4                \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2                \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2                \n\t" /* 000RRGBR 1 */\
                        "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4                \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
                        "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1                \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4                \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2                \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4                \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5                \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3                \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3                \n\t" /* RGBRGBRG 2.5 */\
\
                        MOVNTQ(%%mm0, (%%ebx))\
                        MOVNTQ(%%mm2, 8(%%ebx))\
                        MOVNTQ(%%mm3, 16(%%ebx))\
                        "addl $24, %%ebx                \n\t"\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR24MMX \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGB0RGB 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGB0RGB 2 */\
                        "movq %%mm3, %%mm7                \n\t" /* 0RGB0RGB 3 */\
\
                        "psllq $40, %%mm0                \n\t" /* RGB00000 0 */\
                        "psllq $40, %%mm2                \n\t" /* RGB00000 1 */\
                        "psllq $40, %%mm1                \n\t" /* RGB00000 2 */\
                        "psllq $40, %%mm3                \n\t" /* RGB00000 3 */\
\
                        "punpckhdq %%mm4, %%mm0                \n\t" /* 0RGBRGB0 0 */\
                        "punpckhdq %%mm6, %%mm2                \n\t" /* 0RGBRGB0 1 */\
                        "punpckhdq %%mm5, %%mm1                \n\t" /* 0RGBRGB0 2 */\
                        "punpckhdq %%mm7, %%mm3                \n\t" /* 0RGBRGB0 3 */\
\
                        "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGBRGB0 1 */\
                        "psllq $40, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
                        MOVNTQ(%%mm0, (%%ebx))\
\
                        "psrlq $24, %%mm6                \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGBRGB0 2 */\
                        "psllq $24, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm6                \n\t" /* BRGBRGBR 1 */\
                        MOVNTQ(%%mm6, 8(%%ebx))\
\
                        "psrlq $40, %%mm5                \n\t" /* 000000RG 2 */\
                        "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm3, %%mm5                \n\t" /* RGBRGBRG 2 */\
                        MOVNTQ(%%mm5, 16(%%ebx))\
\
                        "addl $24, %%ebx                \n\t"\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#define WRITEBGR24MMX2 \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq "MANGLE(M24A)", %%mm0        \n\t"\
                        "movq "MANGLE(M24C)", %%mm7        \n\t"\
                        "pshufw $0x50, %%mm2, %%mm1        \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
                        "pshufw $0x50, %%mm4, %%mm3        \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
                        "pshufw $0x00, %%mm5, %%mm6        \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
                        "pand %%mm0, %%mm1                \n\t" /*    B2        B1       B0 */\
                        "pand %%mm0, %%mm3                \n\t" /*    G2        G1       G0 */\
                        "pand %%mm7, %%mm6                \n\t" /*       R1        R0       */\
\
                        "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
                        "por %%mm1, %%mm6                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, (%%ebx))\
\
                        "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
                        "pshufw $0xA5, %%mm2, %%mm1        \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
                        "pshufw $0x55, %%mm4, %%mm3        \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
                        "pshufw $0xA5, %%mm5, %%mm6        \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
                        "pand "MANGLE(M24B)", %%mm1        \n\t" /* B5       B4        B3    */\
                        "pand %%mm7, %%mm3                \n\t" /*       G4        G3       */\
                        "pand %%mm0, %%mm6                \n\t" /*    R4        R3       R2 */\
\
                        "por %%mm1, %%mm3                \n\t" /* B5    G4 B4     G3 B3    */\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 8(%%ebx))\
\
                        "pshufw $0xFF, %%mm2, %%mm1        \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
                        "pshufw $0xFA, %%mm4, %%mm3        \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
                        "pshufw $0xFA, %%mm5, %%mm6        \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
                        "pand %%mm7, %%mm1                \n\t" /*       B7        B6       */\
                        "pand %%mm0, %%mm3                \n\t" /*    G7        G6       G5 */\
                        "pand "MANGLE(M24B)", %%mm6        \n\t" /* R7       R6        R5    */\
\
                        "por %%mm1, %%mm3                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 16(%%ebx))\
\
                        "addl $24, %%ebx                \n\t"\
\
                        "addl $8, %%eax                        \n\t"\
                        "cmpl %5, %%eax                        \n\t"\
                        " jb 1b                                \n\t"

#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX2
#else
#undef WRITEBGR24
#define WRITEBGR24 WRITEBGR24MMX
#endif

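/*
 * Vertical scaler for planar YV12 output: runs YSCALEYUV2YV12X over the luma
 * plane and, if uDest is non-NULL, over both chroma planes (dstW>>1 samples
 * each); falls back to yuv2yuvXinC() when MMX is not available.
 */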
static inline void RENAME(yuv2yuvX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW,
                                    int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                asm volatile(
                                YSCALEYUV2YV12X(0)
                                :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
                                "r" (chrMmxFilter+chrFilterSize*4), "r" (uDest), "m" (dstW>>1)
                                : "%eax", "%edx", "%esi"
                        );

                asm volatile(
                                YSCALEYUV2YV12X(4096)
                                :: "m" (-chrFilterSize), "r" (chrSrc+chrFilterSize),
                                "r" (chrMmxFilter+chrFilterSize*4), "r" (vDest), "m" (dstW>>1)
                                : "%eax", "%edx", "%esi"
                        );
        }

        asm volatile(
                        YSCALEYUV2YV12X(0)
                        :: "m" (-lumFilterSize), "r" (lumSrc+lumFilterSize),
                           "r" (lumMmxFilter+lumFilterSize*4), "r" (dest), "m" (dstW)
                        : "%eax", "%edx", "%esi"
                );
#else
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW);
#endif
}

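/*
 * 1:1 vertical scaling: the MMX path (YSCALEYUV2YV121) just shifts the 16 bit
 * intermediates right by 7 and packs them with unsigned saturation.
 */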
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, int dstW)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + (dstW>>1)), "r" (uDest + (dstW>>1)),
                                "g" (-(dstW>>1))
                                : "%eax"
                        );

                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + 2048 + (dstW>>1)), "r" (vDest + (dstW>>1)),
                                "g" (-(dstW>>1))
                                : "%eax"
                        );
        }

        asm volatile(
                YSCALEYUV2YV121
                :: "r" (lumSrc + dstW), "r" (dest + dstW),
                "g" (-dstW)
                : "%eax"
        );
#else
        //FIXME Optimize (just quickly written, not opti..)
        //FIXME replace MINMAX with LUTs
        int i;
        for(i=0; i<dstW; i++)
        {
                int val= lumSrc[i]>>7;

                dest[i]= MIN(MAX(val>>19, 0), 255);
        }

        if(uDest != NULL)
                for(i=0; i<(dstW>>1); i++)
                {
                        int u=chrSrc[i]>>7;
                        int v=chrSrc[i + 2048]>>7;

                        uDest[i]= MIN(MAX(u>>19, 0), 255);
                        vDest[i]= MIN(MAX(v>>19, 0), 255);
                }
#endif
}


/**
 * vertical scale YV12 to RGB
 */
static inline void RENAME(yuv2rgbX)(int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                            uint8_t *dest, int dstW, int dstFormat, int16_t * lumMmxFilter, int16_t * chrMmxFilter)
{
/*        if(flags&SWS_FULL_UV_IPOL)
        {
//FIXME
        }//FULL_UV_IPOL
        else*/
        {
#ifdef HAVE_MMX
                if(dstFormat == IMGFMT_BGR32) //FIXME untested
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                                WRITEBGR32

                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
                else if(dstFormat == IMGFMT_BGR24) //FIXME untested
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                                "leal (%%eax, %%eax, 2), %%ebx        \n\t" //FIXME optimize
                                "addl %4, %%ebx                        \n\t"
                                WRITEBGR24

                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
                else if(dstFormat==IMGFMT_BGR15)
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR15

                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
                else if(dstFormat==IMGFMT_BGR16)
                {
                        asm volatile(
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16

                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
                        );
                }
#else
yuv2rgbXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, dstW, dstFormat);

#endif
        } //!FULL_UV_IPOL
}


/**
836
 * vertical bilinear scale YV12 to RGB
837
 */
838
static inline void RENAME(yuv2rgb2)(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
839
                            uint8_t *dest, int dstW, int yalpha, int uvalpha, int dstFormat, int flags)
840
{
841
        int yalpha1=yalpha^4095;
842
        int uvalpha1=uvalpha^4095;
843

    
844
        if(flags&SWS_FULL_CHR_H_INT)
845
        {
846

    
847
#ifdef HAVE_MMX
848
                if(dstFormat==IMGFMT_BGR32)
849
                {
850
                        asm volatile(
851

    
852

    
853
FULL_YSCALEYUV2RGB
854
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
855
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
856

    
857
                        "movq %%mm3, %%mm1                \n\t"
858
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
859
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
860

    
861
                        MOVNTQ(%%mm3, (%4, %%eax, 4))
862
                        MOVNTQ(%%mm1, 8(%4, %%eax, 4))
863

    
864
                        "addl $4, %%eax                        \n\t"
865
                        "cmpl %5, %%eax                        \n\t"
866
                        " jb 1b                                \n\t"
867

    
868

    
869
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
870
                        "m" (yalpha1), "m" (uvalpha1)
871
                        : "%eax"
872
                        );
873
                }
874
                else if(dstFormat==IMGFMT_BGR24)
875
                {
876
                        asm volatile(
877

    
878
FULL_YSCALEYUV2RGB
879

    
880
                                                                // lsb ... msb
881
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
882
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
883

    
884
                        "movq %%mm3, %%mm1                \n\t"
885
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
886
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
887

    
888
                        "movq %%mm3, %%mm2                \n\t" // BGR0BGR0
889
                        "psrlq $8, %%mm3                \n\t" // GR0BGR00
890
                        "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
891
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
892
                        "por %%mm2, %%mm3                \n\t" // BGRBGR00
893
                        "movq %%mm1, %%mm2                \n\t"
894
                        "psllq $48, %%mm1                \n\t" // 000000BG
895
                        "por %%mm1, %%mm3                \n\t" // BGRBGRBG
896

    
897
                        "movq %%mm2, %%mm1                \n\t" // BGR0BGR0
898
                        "psrld $16, %%mm2                \n\t" // R000R000
899
                        "psrlq $24, %%mm1                \n\t" // 0BGR0000
900
                        "por %%mm2, %%mm1                \n\t" // RBGRR000
901

    
902
                        "movl %4, %%ebx                        \n\t"
903
                        "addl %%eax, %%ebx                \n\t"
904

    
905
#ifdef HAVE_MMX2
906
                        //FIXME Alignment
907
                        "movntq %%mm3, (%%ebx, %%eax, 2)\n\t"
908
                        "movntq %%mm1, 8(%%ebx, %%eax, 2)\n\t"
909
#else
910
                        "movd %%mm3, (%%ebx, %%eax, 2)        \n\t"
911
                        "psrlq $32, %%mm3                \n\t"
912
                        "movd %%mm3, 4(%%ebx, %%eax, 2)        \n\t"
913
                        "movd %%mm1, 8(%%ebx, %%eax, 2)        \n\t"
914
#endif
915
                        "addl $4, %%eax                        \n\t"
916
                        "cmpl %5, %%eax                        \n\t"
917
                        " jb 1b                                \n\t"
918

    
919
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
920
                        "m" (yalpha1), "m" (uvalpha1)
921
                        : "%eax", "%ebx"
922
                        );
923
                }
924
                else if(dstFormat==IMGFMT_BGR15)
925
                {
926
                        asm volatile(
927

    
928
FULL_YSCALEYUV2RGB
929
#ifdef DITHER1XBPP
930
                        "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
931
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
932
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
933
#endif
934
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
935
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
936
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
937

    
938
                        "psrlw $3, %%mm3                \n\t"
939
                        "psllw $2, %%mm1                \n\t"
940
                        "psllw $7, %%mm0                \n\t"
941
                        "pand "MANGLE(g15Mask)", %%mm1        \n\t"
942
                        "pand "MANGLE(r15Mask)", %%mm0        \n\t"
943

    
944
                        "por %%mm3, %%mm1                \n\t"
945
                        "por %%mm1, %%mm0                \n\t"
946

    
947
                        MOVNTQ(%%mm0, (%4, %%eax, 2))
948

    
949
                        "addl $4, %%eax                        \n\t"
950
                        "cmpl %5, %%eax                        \n\t"
951
                        " jb 1b                                \n\t"
952

    
953
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
954
                        "m" (yalpha1), "m" (uvalpha1)
955
                        : "%eax"
956
                        );
957
                }
958
                else if(dstFormat==IMGFMT_BGR16)
959
                {
960
                        asm volatile(
961

    
962
FULL_YSCALEYUV2RGB
963
#ifdef DITHER1XBPP
964
                        "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
965
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
966
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
967
#endif
968
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
969
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
970
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
971

    
972
                        "psrlw $3, %%mm3                \n\t"
973
                        "psllw $3, %%mm1                \n\t"
974
                        "psllw $8, %%mm0                \n\t"
975
                        "pand "MANGLE(g16Mask)", %%mm1        \n\t"
976
                        "pand "MANGLE(r16Mask)", %%mm0        \n\t"
977

    
978
                        "por %%mm3, %%mm1                \n\t"
979
                        "por %%mm1, %%mm0                \n\t"
980

    
981
                        MOVNTQ(%%mm0, (%4, %%eax, 2))
982

    
983
                        "addl $4, %%eax                        \n\t"
984
                        "cmpl %5, %%eax                        \n\t"
985
                        " jb 1b                                \n\t"
986

    
987
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
988
                        "m" (yalpha1), "m" (uvalpha1)
989
                        : "%eax"
990
                        );
991
                }
#else
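		/* Added explanatory note: the plain C fallback below does the vertical
		   linear interpolation and the YUV->RGB conversion in one step.  Y, U
		   and V are blended with the 12 bit weights (alpha, 4095-alpha), the
		   product is shifted down (>>19) to form a small table index, and the
		   precomputed yuvtab_* contribution tables are summed per component and
		   clipped through clip_table (>>13). */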
		if(dstFormat==IMGFMT_BGR32)
		{
#ifdef WORDS_BIGENDIAN
			dest++;
#endif
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+= 4;
			}
		}
		else if(dstFormat==IMGFMT_BGR24)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
				dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
				dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
				dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
				dest+= 3;
			}
		}
		else if(dstFormat==IMGFMT_BGR16)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table16r[(Y + yuvtab_3343[V]) >>13];
			}
		}
		else if(dstFormat==IMGFMT_BGR15)
		{
			int i;
			for(i=0;i<dstW;i++){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
				int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

				((uint16_t*)dest)[i] =
					clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
					clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
					clip_table15r[(Y + yuvtab_3343[V]) >>13];
			}
		}
#endif
	}//FULL_UV_IPOL
	else
	{
#ifdef HAVE_MMX
		if(dstFormat==IMGFMT_BGR32)
		{
			asm volatile(
				YSCALEYUV2RGB
				WRITEBGR32

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstFormat==IMGFMT_BGR24)
		{
			asm volatile(
				"movl %4, %%ebx	\n\t"
				YSCALEYUV2RGB
				WRITEBGR24

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
		}
		else if(dstFormat==IMGFMT_BGR15)
		{
			asm volatile(
				YSCALEYUV2RGB
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR15

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstFormat==IMGFMT_BGR16)
		{
			asm volatile(
				YSCALEYUV2RGB
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16

			:: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
#else
		if(dstFormat==IMGFMT_BGR32)
		{
#ifdef WORDS_BIGENDIAN
			dest++;
#endif
			int i;
			for(i=0; i<dstW-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
				dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
				dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

				dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
				dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
				dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
			}
		}
		else if(dstFormat==IMGFMT_BGR24)
		{
			int i;
			for(i=0; i<dstW-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				dest[0]=clip_table[((Y1 + Cb) >>13)];
				dest[1]=clip_table[((Y1 + Cg) >>13)];
				dest[2]=clip_table[((Y1 + Cr) >>13)];

				dest[3]=clip_table[((Y2 + Cb) >>13)];
				dest[4]=clip_table[((Y2 + Cg) >>13)];
				dest[5]=clip_table[((Y2 + Cr) >>13)];
				dest+=6;
			}
		}
		else if(dstFormat==IMGFMT_BGR16)
		{
			int i;
#ifdef DITHER1XBPP
			static int ditherb1=1<<14;
			static int ditherg1=1<<13;
			static int ditherr1=2<<14;
			static int ditherb2=3<<14;
			static int ditherg2=3<<13;
			static int ditherr2=0<<14;

			ditherb1 ^= (1^2)<<14;
			ditherg1 ^= (1^2)<<13;
			ditherr1 ^= (1^2)<<14;
			ditherb2 ^= (3^0)<<14;
			ditherg2 ^= (3^0)<<13;
			ditherr2 ^= (3^0)<<14;
#else
			const int ditherb1=0;
			const int ditherg1=0;
			const int ditherr1=0;
			const int ditherb2=0;
			const int ditherg2=0;
			const int ditherr2=0;
#endif
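			/* Added note: the offsets above implement a tiny ordered dither for
			   the 5/6 bit channels.  Each value is a sub-LSB rounding offset
			   (the samples are quantized by the >>13 below), and the XORs flip
			   the pattern between successive calls, i.e. between neighbouring
			   output lines, so the rounding error alternates instead of
			   forming bands. */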
			for(i=0; i<dstW-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				((uint16_t*)dest)[i] =
					clip_table16b[(Y1 + Cb + ditherb1) >>13] |
					clip_table16g[(Y1 + Cg + ditherg1) >>13] |
					clip_table16r[(Y1 + Cr + ditherr1) >>13];

				((uint16_t*)dest)[i+1] =
					clip_table16b[(Y2 + Cb + ditherb2) >>13] |
					clip_table16g[(Y2 + Cg + ditherg2) >>13] |
					clip_table16r[(Y2 + Cr + ditherr2) >>13];
			}
		}
		else if(dstFormat==IMGFMT_BGR15)
		{
			int i;
#ifdef DITHER1XBPP
			static int ditherb1=1<<14;
			static int ditherg1=1<<14;
			static int ditherr1=2<<14;
			static int ditherb2=3<<14;
			static int ditherg2=3<<14;
			static int ditherr2=0<<14;

			ditherb1 ^= (1^2)<<14;
			ditherg1 ^= (1^2)<<14;
			ditherr1 ^= (1^2)<<14;
			ditherb2 ^= (3^0)<<14;
			ditherg2 ^= (3^0)<<14;
			ditherr2 ^= (3^0)<<14;
#else
			const int ditherb1=0;
			const int ditherg1=0;
			const int ditherr1=0;
			const int ditherb2=0;
			const int ditherg2=0;
			const int ditherr2=0;
#endif
			for(i=0; i<dstW-1; i+=2){
				// vertical linear interpolation && yuv2rgb in a single step:
				int Y1=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
				int Y2=yuvtab_2568[((buf0[i+1]*yalpha1+buf1[i+1]*yalpha)>>19)];
				int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
				int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

				int Cb= yuvtab_40cf[U];
				int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
				int Cr= yuvtab_3343[V];

				((uint16_t*)dest)[i] =
					clip_table15b[(Y1 + Cb + ditherb1) >>13] |
					clip_table15g[(Y1 + Cg + ditherg1) >>13] |
					clip_table15r[(Y1 + Cr + ditherr1) >>13];

				((uint16_t*)dest)[i+1] =
					clip_table15b[(Y2 + Cb + ditherb2) >>13] |
					clip_table15g[(Y2 + Cg + ditherg2) >>13] |
					clip_table15r[(Y2 + Cr + ditherr2) >>13];
			}
		}
#endif
	} //!FULL_UV_IPOL
}

/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2rgb1)(uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
			    uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags)
{
	int uvalpha1=uvalpha^4095;
	const int yalpha1=0;

	if(flags&SWS_FULL_CHR_H_INT)
	{
		RENAME(yuv2rgb2)(buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, dstFormat, flags);
		return;
	}

#ifdef HAVE_MMX
	if( uvalpha < 2048 ) // note: this is not correct (it shifts chrominance by 0.5 pixels) but it's a bit faster
	{
		if(dstFormat==IMGFMT_BGR32)
		{
			asm volatile(
				YSCALEYUV2RGB1
				WRITEBGR32
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstFormat==IMGFMT_BGR24)
		{
			asm volatile(
				"movl %4, %%ebx	\n\t"
				YSCALEYUV2RGB1
				WRITEBGR24
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
		}
		else if(dstFormat==IMGFMT_BGR15)
		{
			asm volatile(
				YSCALEYUV2RGB1
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
				WRITEBGR15
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstFormat==IMGFMT_BGR16)
		{
			asm volatile(
				YSCALEYUV2RGB1
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
	}
	else
	{
		if(dstFormat==IMGFMT_BGR32)
		{
			asm volatile(
				YSCALEYUV2RGB1b
				WRITEBGR32
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstFormat==IMGFMT_BGR24)
		{
			asm volatile(
				"movl %4, %%ebx	\n\t"
				YSCALEYUV2RGB1b
				WRITEBGR24
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax", "%ebx"
			);
		}
		else if(dstFormat==IMGFMT_BGR15)
		{
			asm volatile(
				YSCALEYUV2RGB1b
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g5Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
				WRITEBGR15
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
		else if(dstFormat==IMGFMT_BGR16)
		{
			asm volatile(
				YSCALEYUV2RGB1b
		/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
				"paddusb "MANGLE(b5Dither)", %%mm2\n\t"
				"paddusb "MANGLE(g6Dither)", %%mm4\n\t"
				"paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

				WRITEBGR16
			:: "r" (buf0), "r" (buf0), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
			"m" (yalpha1), "m" (uvalpha1)
			: "%eax"
			);
		}
	}
#else
//FIXME write 2 versions (for even & odd lines)

	if(dstFormat==IMGFMT_BGR32)
	{
#ifdef WORDS_BIGENDIAN
		dest++;
#endif
		int i;
		for(i=0; i<dstW-1; i+=2){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y1=yuvtab_2568[buf0[i]>>7];
			int Y2=yuvtab_2568[buf0[i+1]>>7];
			int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
			int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

			int Cb= yuvtab_40cf[U];
			int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
			int Cr= yuvtab_3343[V];

			dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
			dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
			dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];

			dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
			dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
			dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
		}
	}
	else if(dstFormat==IMGFMT_BGR24)
	{
		int i;
		for(i=0; i<dstW-1; i+=2){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y1=yuvtab_2568[buf0[i]>>7];
			int Y2=yuvtab_2568[buf0[i+1]>>7];
			int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
			int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

			int Cb= yuvtab_40cf[U];
			int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
			int Cr= yuvtab_3343[V];

			dest[0]=clip_table[((Y1 + Cb) >>13)];
			dest[1]=clip_table[((Y1 + Cg) >>13)];
			dest[2]=clip_table[((Y1 + Cr) >>13)];

			dest[3]=clip_table[((Y2 + Cb) >>13)];
			dest[4]=clip_table[((Y2 + Cg) >>13)];
			dest[5]=clip_table[((Y2 + Cr) >>13)];
			dest+=6;
		}
	}
	else if(dstFormat==IMGFMT_BGR16)
	{
		int i;
#ifdef DITHER1XBPP
		static int ditherb1=1<<14;
		static int ditherg1=1<<13;
		static int ditherr1=2<<14;
		static int ditherb2=3<<14;
		static int ditherg2=3<<13;
		static int ditherr2=0<<14;

		ditherb1 ^= (1^2)<<14;
		ditherg1 ^= (1^2)<<13;
		ditherr1 ^= (1^2)<<14;
		ditherb2 ^= (3^0)<<14;
		ditherg2 ^= (3^0)<<13;
		ditherr2 ^= (3^0)<<14;
#else
		const int ditherb1=0;
		const int ditherg1=0;
		const int ditherr1=0;
		const int ditherb2=0;
		const int ditherg2=0;
		const int ditherr2=0;
#endif
		for(i=0; i<dstW-1; i+=2){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y1=yuvtab_2568[buf0[i]>>7];
			int Y2=yuvtab_2568[buf0[i+1]>>7];
			int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
			int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

			int Cb= yuvtab_40cf[U];
			int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
			int Cr= yuvtab_3343[V];

			((uint16_t*)dest)[i] =
				clip_table16b[(Y1 + Cb + ditherb1) >>13] |
				clip_table16g[(Y1 + Cg + ditherg1) >>13] |
				clip_table16r[(Y1 + Cr + ditherr1) >>13];

			((uint16_t*)dest)[i+1] =
				clip_table16b[(Y2 + Cb + ditherb2) >>13] |
				clip_table16g[(Y2 + Cg + ditherg2) >>13] |
				clip_table16r[(Y2 + Cr + ditherr2) >>13];
		}
	}
	else if(dstFormat==IMGFMT_BGR15)
	{
		int i;
#ifdef DITHER1XBPP
		static int ditherb1=1<<14;
		static int ditherg1=1<<14;
		static int ditherr1=2<<14;
		static int ditherb2=3<<14;
		static int ditherg2=3<<14;
		static int ditherr2=0<<14;

		ditherb1 ^= (1^2)<<14;
		ditherg1 ^= (1^2)<<14;
		ditherr1 ^= (1^2)<<14;
		ditherb2 ^= (3^0)<<14;
		ditherg2 ^= (3^0)<<14;
		ditherr2 ^= (3^0)<<14;
#else
		const int ditherb1=0;
		const int ditherg1=0;
		const int ditherr1=0;
		const int ditherb2=0;
		const int ditherg2=0;
		const int ditherr2=0;
#endif
		for(i=0; i<dstW-1; i+=2){
			// vertical linear interpolation && yuv2rgb in a single step:
			int Y1=yuvtab_2568[buf0[i]>>7];
			int Y2=yuvtab_2568[buf0[i+1]>>7];
			int U=((uvbuf0[i>>1]*uvalpha1+uvbuf1[i>>1]*uvalpha)>>19);
			int V=((uvbuf0[(i>>1)+2048]*uvalpha1+uvbuf1[(i>>1)+2048]*uvalpha)>>19);

			int Cb= yuvtab_40cf[U];
			int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
			int Cr= yuvtab_3343[V];

			((uint16_t*)dest)[i] =
				clip_table15b[(Y1 + Cb + ditherb1) >>13] |
				clip_table15g[(Y1 + Cg + ditherg1) >>13] |
				clip_table15r[(Y1 + Cr + ditherr1) >>13];

			((uint16_t*)dest)[i+1] =
				clip_table15b[(Y2 + Cb + ditherb2) >>13] |
				clip_table15g[(Y2 + Cg + ditherg2) >>13] |
				clip_table15r[(Y2 + Cr + ditherr2) >>13];
		}
	}
#endif
}

//FIXME the yuy2* functions can read up to 7 samples too many

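/* Added note: the *ToY / *ToUV helpers below convert one line of packed input
   (YUY2 or the BGR/RGB variants) into the planar 8 bit Y respectively the
   horizontally/vertically averaged U and V samples that the horizontal scaling
   code then consumes; each has an MMX path where one exists and a plain C
   fallback otherwise. */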
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm2\n\t"
		"movl %0, %%eax	\n\t"
		"1:	\n\t"
		"movq (%1, %%eax,2), %%mm0	\n\t"
		"movq 8(%1, %%eax,2), %%mm1	\n\t"
		"pand %%mm2, %%mm0	\n\t"
		"pand %%mm2, %%mm1	\n\t"
		"packuswb %%mm1, %%mm0	\n\t"
		"movq %%mm0, (%2, %%eax)	\n\t"
		"addl $8, %%eax	\n\t"
		" js 1b	\n\t"
		: : "g" (-width), "r" (src+width*2), "r" (dst+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
		dst[i]= src[2*i];
#endif
}

static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	asm volatile(
		"movq "MANGLE(bm01010101)", %%mm4\n\t"
		"movl %0, %%eax	\n\t"
		"1:	\n\t"
		"movq (%1, %%eax,4), %%mm0	\n\t"
		"movq 8(%1, %%eax,4), %%mm1	\n\t"
		"movq (%2, %%eax,4), %%mm2	\n\t"
		"movq 8(%2, %%eax,4), %%mm3	\n\t"
		PAVGB(%%mm2, %%mm0)
		PAVGB(%%mm3, %%mm1)
		"psrlw $8, %%mm0	\n\t"
		"psrlw $8, %%mm1	\n\t"
		"packuswb %%mm1, %%mm0	\n\t"
		"movq %%mm0, %%mm1	\n\t"
		"psrlw $8, %%mm0	\n\t"
		"pand %%mm4, %%mm1	\n\t"
		"packuswb %%mm0, %%mm0	\n\t"
		"packuswb %%mm1, %%mm1	\n\t"
		"movd %%mm0, (%4, %%eax)	\n\t"
		"movd %%mm1, (%3, %%eax)	\n\t"
		"addl $4, %%eax	\n\t"
		" js 1b	\n\t"
		: : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
		: "%eax"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
		dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
	}
#endif
}
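/* Added note: the RGB->UV helpers below average a 2x2 block (two adjacent
   pixels on each of the two source lines), which is why their shift is
   RGB2YUV_SHIFT+2 while the Y helpers use RGB2YUV_SHIFT; the +16 and +128
   offsets produce the usual limited-range YCbCr levels. */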

static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMXFIXME
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src[i*4+0];
		int g= src[i*4+1];
		int r= src[i*4+2];

		dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
	}
#endif
}

static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMXFIXME
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
		int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
		int r= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
#endif
}

static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %2, %%eax	\n\t"
		"movq bgr2YCoeff, %%mm6	\n\t"
		"movq w1111, %%mm5	\n\t"
		"pxor %%mm7, %%mm7	\n\t"
		"leal (%%eax, %%eax, 2), %%ebx	\n\t"
		".balign 16	\n\t"
		"1:	\n\t"
		PREFETCH" 64(%0, %%ebx)	\n\t"
		"movd (%0, %%ebx), %%mm0	\n\t"
		"movd 3(%0, %%ebx), %%mm1	\n\t"
		"punpcklbw %%mm7, %%mm0	\n\t"
		"punpcklbw %%mm7, %%mm1	\n\t"
		"movd 6(%0, %%ebx), %%mm2	\n\t"
		"movd 9(%0, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm2	\n\t"
		"punpcklbw %%mm7, %%mm3	\n\t"
		"pmaddwd %%mm6, %%mm0	\n\t"
		"pmaddwd %%mm6, %%mm1	\n\t"
		"pmaddwd %%mm6, %%mm2	\n\t"
		"pmaddwd %%mm6, %%mm3	\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0	\n\t"
		"psrad $8, %%mm1	\n\t"
		"psrad $8, %%mm2	\n\t"
		"psrad $8, %%mm3	\n\t"
#endif
		"packssdw %%mm1, %%mm0	\n\t"
		"packssdw %%mm3, %%mm2	\n\t"
		"pmaddwd %%mm5, %%mm0	\n\t"
		"pmaddwd %%mm5, %%mm2	\n\t"
		"packssdw %%mm2, %%mm0	\n\t"
		"psraw $7, %%mm0	\n\t"

		"movd 12(%0, %%ebx), %%mm4	\n\t"
		"movd 15(%0, %%ebx), %%mm1	\n\t"
		"punpcklbw %%mm7, %%mm4	\n\t"
		"punpcklbw %%mm7, %%mm1	\n\t"
		"movd 18(%0, %%ebx), %%mm2	\n\t"
		"movd 21(%0, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm2	\n\t"
		"punpcklbw %%mm7, %%mm3	\n\t"
		"pmaddwd %%mm6, %%mm4	\n\t"
		"pmaddwd %%mm6, %%mm1	\n\t"
		"pmaddwd %%mm6, %%mm2	\n\t"
		"pmaddwd %%mm6, %%mm3	\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4	\n\t"
		"psrad $8, %%mm1	\n\t"
		"psrad $8, %%mm2	\n\t"
		"psrad $8, %%mm3	\n\t"
#endif
		"packssdw %%mm1, %%mm4	\n\t"
		"packssdw %%mm3, %%mm2	\n\t"
		"pmaddwd %%mm5, %%mm4	\n\t"
		"pmaddwd %%mm5, %%mm2	\n\t"
		"addl $24, %%ebx	\n\t"
		"packssdw %%mm2, %%mm4	\n\t"
		"psraw $7, %%mm4	\n\t"

		"packuswb %%mm4, %%mm0	\n\t"
		"paddusb bgr2YOffset, %%mm0	\n\t"

		"movq %%mm0, (%1, %%eax)	\n\t"
		"addl $8, %%eax	\n\t"
		" js 1b	\n\t"
		: : "r" (src+width*3), "r" (dst+width), "g" (-width)
		: "%eax", "%ebx"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src[i*3+0];
		int g= src[i*3+1];
		int r= src[i*3+2];

		dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
	}
#endif
}

static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
#ifdef HAVE_MMX
	asm volatile(
		"movl %4, %%eax	\n\t"
		"movq w1111, %%mm5	\n\t"
		"movq bgr2UCoeff, %%mm6	\n\t"
		"pxor %%mm7, %%mm7	\n\t"
		"leal (%%eax, %%eax, 2), %%ebx	\n\t"
		"addl %%ebx, %%ebx	\n\t"
		".balign 16	\n\t"
		"1:	\n\t"
		PREFETCH" 64(%0, %%ebx)	\n\t"
		PREFETCH" 64(%1, %%ebx)	\n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		"movq (%0, %%ebx), %%mm0	\n\t"
		"movq (%1, %%ebx), %%mm1	\n\t"
		"movq 6(%0, %%ebx), %%mm2	\n\t"
		"movq 6(%1, %%ebx), %%mm3	\n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm0, %%mm1	\n\t"
		"movq %%mm2, %%mm3	\n\t"
		"psrlq $24, %%mm0	\n\t"
		"psrlq $24, %%mm2	\n\t"
		PAVGB(%%mm1, %%mm0)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm0	\n\t"
		"punpcklbw %%mm7, %%mm2	\n\t"
#else
		"movd (%0, %%ebx), %%mm0	\n\t"
		"movd (%1, %%ebx), %%mm1	\n\t"
		"movd 3(%0, %%ebx), %%mm2	\n\t"
		"movd 3(%1, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm0	\n\t"
		"punpcklbw %%mm7, %%mm1	\n\t"
		"punpcklbw %%mm7, %%mm2	\n\t"
		"punpcklbw %%mm7, %%mm3	\n\t"
		"paddw %%mm1, %%mm0	\n\t"
		"paddw %%mm3, %%mm2	\n\t"
		"paddw %%mm2, %%mm0	\n\t"
		"movd 6(%0, %%ebx), %%mm4	\n\t"
		"movd 6(%1, %%ebx), %%mm1	\n\t"
		"movd 9(%0, %%ebx), %%mm2	\n\t"
		"movd 9(%1, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm4	\n\t"
		"punpcklbw %%mm7, %%mm1	\n\t"
		"punpcklbw %%mm7, %%mm2	\n\t"
		"punpcklbw %%mm7, %%mm3	\n\t"
		"paddw %%mm1, %%mm4	\n\t"
		"paddw %%mm3, %%mm2	\n\t"
		"paddw %%mm4, %%mm2	\n\t"
		"psrlw $2, %%mm0	\n\t"
		"psrlw $2, %%mm2	\n\t"
#endif
		"movq bgr2VCoeff, %%mm1	\n\t"
		"movq bgr2VCoeff, %%mm3	\n\t"

		"pmaddwd %%mm0, %%mm1	\n\t"
		"pmaddwd %%mm2, %%mm3	\n\t"
		"pmaddwd %%mm6, %%mm0	\n\t"
		"pmaddwd %%mm6, %%mm2	\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm0	\n\t"
		"psrad $8, %%mm1	\n\t"
		"psrad $8, %%mm2	\n\t"
		"psrad $8, %%mm3	\n\t"
#endif
		"packssdw %%mm2, %%mm0	\n\t"
		"packssdw %%mm3, %%mm1	\n\t"
		"pmaddwd %%mm5, %%mm0	\n\t"
		"pmaddwd %%mm5, %%mm1	\n\t"
		"packssdw %%mm1, %%mm0	\n\t" // V1 V0 U1 U0
		"psraw $7, %%mm0	\n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
		"movq 12(%0, %%ebx), %%mm4	\n\t"
		"movq 12(%1, %%ebx), %%mm1	\n\t"
		"movq 18(%0, %%ebx), %%mm2	\n\t"
		"movq 18(%1, %%ebx), %%mm3	\n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"movq %%mm4, %%mm1	\n\t"
		"movq %%mm2, %%mm3	\n\t"
		"psrlq $24, %%mm4	\n\t"
		"psrlq $24, %%mm2	\n\t"
		PAVGB(%%mm1, %%mm4)
		PAVGB(%%mm3, %%mm2)
		"punpcklbw %%mm7, %%mm4	\n\t"
		"punpcklbw %%mm7, %%mm2	\n\t"
#else
		"movd 12(%0, %%ebx), %%mm4	\n\t"
		"movd 12(%1, %%ebx), %%mm1	\n\t"
		"movd 15(%0, %%ebx), %%mm2	\n\t"
		"movd 15(%1, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm4	\n\t"
		"punpcklbw %%mm7, %%mm1	\n\t"
		"punpcklbw %%mm7, %%mm2	\n\t"
		"punpcklbw %%mm7, %%mm3	\n\t"
		"paddw %%mm1, %%mm4	\n\t"
		"paddw %%mm3, %%mm2	\n\t"
		"paddw %%mm2, %%mm4	\n\t"
		"movd 18(%0, %%ebx), %%mm5	\n\t"
		"movd 18(%1, %%ebx), %%mm1	\n\t"
		"movd 21(%0, %%ebx), %%mm2	\n\t"
		"movd 21(%1, %%ebx), %%mm3	\n\t"
		"punpcklbw %%mm7, %%mm5	\n\t"
		"punpcklbw %%mm7, %%mm1	\n\t"
		"punpcklbw %%mm7, %%mm2	\n\t"
		"punpcklbw %%mm7, %%mm3	\n\t"
		"paddw %%mm1, %%mm5	\n\t"
		"paddw %%mm3, %%mm2	\n\t"
		"paddw %%mm5, %%mm2	\n\t"
		"movq w1111, %%mm5	\n\t"
		"psrlw $2, %%mm4	\n\t"
		"psrlw $2, %%mm2	\n\t"
#endif
		"movq bgr2VCoeff, %%mm1	\n\t"
		"movq bgr2VCoeff, %%mm3	\n\t"

		"pmaddwd %%mm4, %%mm1	\n\t"
		"pmaddwd %%mm2, %%mm3	\n\t"
		"pmaddwd %%mm6, %%mm4	\n\t"
		"pmaddwd %%mm6, %%mm2	\n\t"
#ifndef FAST_BGR2YV12
		"psrad $8, %%mm4	\n\t"
		"psrad $8, %%mm1	\n\t"
		"psrad $8, %%mm2	\n\t"
		"psrad $8, %%mm3	\n\t"
#endif
		"packssdw %%mm2, %%mm4	\n\t"
		"packssdw %%mm3, %%mm1	\n\t"
		"pmaddwd %%mm5, %%mm4	\n\t"
		"pmaddwd %%mm5, %%mm1	\n\t"
		"addl $24, %%ebx	\n\t"
		"packssdw %%mm1, %%mm4	\n\t" // V3 V2 U3 U2
		"psraw $7, %%mm4	\n\t"

		"movq %%mm0, %%mm1	\n\t"
		"punpckldq %%mm4, %%mm0	\n\t"
		"punpckhdq %%mm4, %%mm1	\n\t"
		"packsswb %%mm1, %%mm0	\n\t"
		"paddb bgr2UVOffset, %%mm0	\n\t"

		"movd %%mm0, (%2, %%eax)	\n\t"
		"punpckhdq %%mm0, %%mm0	\n\t"
		"movd %%mm0, (%3, %%eax)	\n\t"
		"addl $4, %%eax	\n\t"
		" js 1b	\n\t"
		: : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
		: "%eax", "%ebx"
	);
#else
	int i;
	for(i=0; i<width; i++)
	{
		int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
		int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
		int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
#endif
}

static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d= src[i*2] + (src[i*2+1]<<8);
		int b= d&0x1F;
		int g= (d>>5)&0x3F;
		int r= (d>>11)&0x1F;

		dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
	}
}
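/* Added note: the #if 1 branch of bgr16ToUV/bgr15ToUV below appears to sum the
   four source pixels (two per line) SWAR-style: the masks (0x07E0F81F etc.)
   keep alternating colour fields separated with enough headroom that two packed
   32 bit words can be added at once, and the final shift/add gathers the B, G
   and R sums of all four pixels into disjoint fields before they are extracted
   again. */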

static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
#if 1
		int d0= le2me_32( ((uint32_t*)src1)[i] );
		int d1= le2me_32( ((uint32_t*)src2)[i] );

		int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
		int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;
		int r= (d>>11)&0x7F;
		int g= d>>21;
#else
		int d0= src1[i*4] + (src1[i*4+1]<<8);
		int b0= d0&0x1F;
		int g0= (d0>>5)&0x3F;
		int r0= (d0>>11)&0x1F;

		int d1= src1[i*4+2] + (src1[i*4+3]<<8);
		int b1= d1&0x1F;
		int g1= (d1>>5)&0x3F;
		int r1= (d1>>11)&0x1F;

		int d2= src2[i*4] + (src2[i*4+1]<<8);
		int b2= d2&0x1F;
		int g2= (d2>>5)&0x3F;
		int r2= (d2>>11)&0x1F;

		int d3= src2[i*4+2] + (src2[i*4+3]<<8);
		int b3= d3&0x1F;
		int g3= (d3>>5)&0x3F;
		int r3= (d3>>11)&0x1F;

		int b= b0 + b1 + b2 + b3;
		int g= g0 + g1 + g2 + g3;
		int r= r0 + r1 + r2 + r3;
#endif
		dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
		dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
	}
}

static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int d= src[i*2] + (src[i*2+1]<<8);
		int b= d&0x1F;
		int g= (d>>5)&0x1F;
		int r= (d>>10)&0x1F;

		dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
	}
}

static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
#if 1
		int d0= le2me_32( ((uint32_t*)src1)[i] );
		int d1= le2me_32( ((uint32_t*)src2)[i] );

		int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
		int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);

		int dh2= (dh>>11) + (dh<<21);
		int d= dh2 + dl;

		int b= d&0x7F;
		int r= (d>>10)&0x7F;
		int g= d>>21;
#else
		int d0= src1[i*4] + (src1[i*4+1]<<8);
		int b0= d0&0x1F;
		int g0= (d0>>5)&0x1F;
		int r0= (d0>>10)&0x1F;

		int d1= src1[i*4+2] + (src1[i*4+3]<<8);
		int b1= d1&0x1F;
		int g1= (d1>>5)&0x1F;
		int r1= (d1>>10)&0x1F;

		int d2= src2[i*4] + (src2[i*4+1]<<8);
		int b2= d2&0x1F;
		int g2= (d2>>5)&0x1F;
		int r2= (d2>>10)&0x1F;

		int d3= src2[i*4+2] + (src2[i*4+3]<<8);
		int b3= d3&0x1F;
		int g3= (d3>>5)&0x1F;
		int r3= (d3>>10)&0x1F;

		int b= b0 + b1 + b2 + b3;
		int g= g0 + g1 + g2 + g3;
		int r= r0 + r1 + r2 + r3;
#endif
		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
	}
}


static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src[i*4+0];
		int g= src[i*4+1];
		int b= src[i*4+2];

		dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
	}
}

static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src1[8*i + 0] + src1[8*i + 4] + src2[8*i + 0] + src2[8*i + 4];
		int g= src1[8*i + 1] + src1[8*i + 5] + src2[8*i + 1] + src2[8*i + 5];
		int b= src1[8*i + 2] + src1[8*i + 6] + src2[8*i + 2] + src2[8*i + 6];

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}

static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src[i*3+0];
		int g= src[i*3+1];
		int b= src[i*3+2];

		dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
	}
}

static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
	int i;
	for(i=0; i<width; i++)
	{
		int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
		int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
		int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

		dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
		dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
	}
}


// Bilinear / Bicubic scaling
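/* Added note: hScale is the generic horizontal scaler.  For every output
   sample it applies a filterSize-tap FIR filter taken from filter[] at the
   source position filterPos[i] and stores a 16 bit intermediate result in
   dst[].  The MMX paths below special-case filterSize 4 and 8; everything
   else goes through the generic inner loop. */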
2077
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2078
                                  int16_t *filter, int16_t *filterPos, int filterSize)
2079
{
2080
#ifdef HAVE_MMX
2081
        if(filterSize==4) // allways true for upscaling, sometimes for down too
2082
        {
2083
                int counter= -2*dstW;
2084
                filter-= counter*2;
2085
                filterPos-= counter/2;
2086
                dst-= counter/2;
2087
                asm volatile(
2088
                        "pxor %%mm7, %%mm7                \n\t"
2089
                        "movq "MANGLE(w02)", %%mm6        \n\t"
2090
                        "pushl %%ebp                        \n\t" // we use 7 regs here ...
2091
                        "movl %%eax, %%ebp                \n\t"
2092
                        ".balign 16                        \n\t"
2093
                        "1:                                \n\t"
2094
                        "movzwl (%2, %%ebp), %%eax        \n\t"
2095
                        "movzwl 2(%2, %%ebp), %%ebx        \n\t"
2096
                        "movq (%1, %%ebp, 4), %%mm1        \n\t"
2097
                        "movq 8(%1, %%ebp, 4), %%mm3        \n\t"
2098
                        "movd (%3, %%eax), %%mm0        \n\t"
2099
                        "movd (%3, %%ebx), %%mm2        \n\t"
2100
                        "punpcklbw %%mm7, %%mm0                \n\t"
2101
                        "punpcklbw %%mm7, %%mm2                \n\t"
2102
                        "pmaddwd %%mm1, %%mm0                \n\t"
2103
                        "pmaddwd %%mm2, %%mm3                \n\t"
2104
                        "psrad $8, %%mm0                \n\t"
2105
                        "psrad $8, %%mm3                \n\t"
2106
                        "packssdw %%mm3, %%mm0                \n\t"
2107
                        "pmaddwd %%mm6, %%mm0                \n\t"
2108
                        "packssdw %%mm0, %%mm0                \n\t"
2109
                        "movd %%mm0, (%4, %%ebp)        \n\t"
2110
                        "addl $4, %%ebp                        \n\t"
2111
                        " jnc 1b                        \n\t"
2112

    
2113
                        "popl %%ebp                        \n\t"
2114
                        : "+a" (counter)
2115
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2116
                        : "%ebx"
2117
                );
2118
        }
2119
        else if(filterSize==8)
        {
                int counter= -2*dstW;
                filter-= counter*4;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "movq "MANGLE(w02)", %%mm6        \n\t"
                        "pushl %%ebp                        \n\t" // we use 7 regs here ...
                        "movl %%eax, %%ebp                \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        "movzwl (%2, %%ebp), %%eax        \n\t"
                        "movzwl 2(%2, %%ebp), %%ebx        \n\t"
                        "movq (%1, %%ebp, 8), %%mm1        \n\t"
                        "movq 16(%1, %%ebp, 8), %%mm3        \n\t"
                        "movd (%3, %%eax), %%mm0        \n\t"
                        "movd (%3, %%ebx), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "pmaddwd %%mm1, %%mm0                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"

                        "movq 8(%1, %%ebp, 8), %%mm1        \n\t"
                        "movq 24(%1, %%ebp, 8), %%mm5        \n\t"
                        "movd 4(%3, %%eax), %%mm4        \n\t"
                        "movd 4(%3, %%ebx), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm4                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "pmaddwd %%mm1, %%mm4                \n\t"
                        "pmaddwd %%mm2, %%mm5                \n\t"
                        "paddd %%mm4, %%mm0                \n\t"
                        "paddd %%mm5, %%mm3                \n\t"

                        "psrad $8, %%mm0                \n\t"
                        "psrad $8, %%mm3                \n\t"
                        "packssdw %%mm3, %%mm0                \n\t"
                        "pmaddwd %%mm6, %%mm0                \n\t"
                        "packssdw %%mm0, %%mm0                \n\t"
                        "movd %%mm0, (%4, %%ebp)        \n\t"
                        "addl $4, %%ebp                        \n\t"
                        " jnc 1b                        \n\t"

                        "popl %%ebp                        \n\t"
                        : "+a" (counter)
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
                        : "%ebx"
                );
        }
        else
        {
                int counter= -2*dstW;
//                filter-= counter*filterSize/2;
                filterPos-= counter/2;
                dst-= counter/2;
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "movq "MANGLE(w02)", %%mm6        \n\t"
                        ".balign 16                        \n\t"
                        "1:                                \n\t"
                        "movl %2, %%ecx                        \n\t"
                        "movzwl (%%ecx, %0), %%eax        \n\t"
                        "movzwl 2(%%ecx, %0), %%ebx        \n\t"
                        "movl %5, %%ecx                        \n\t"
                        "pxor %%mm4, %%mm4                \n\t"
                        "pxor %%mm5, %%mm5                \n\t"
                        "2:                                \n\t"
                        "movq (%1), %%mm1                \n\t"
                        "movq (%1, %6), %%mm3                \n\t"
                        "movd (%%ecx, %%eax), %%mm0        \n\t"
                        "movd (%%ecx, %%ebx), %%mm2        \n\t"
                        "punpcklbw %%mm7, %%mm0                \n\t"
                        "punpcklbw %%mm7, %%mm2                \n\t"
                        "pmaddwd %%mm1, %%mm0                \n\t"
                        "pmaddwd %%mm2, %%mm3                \n\t"
                        "paddd %%mm3, %%mm5                \n\t"
                        "paddd %%mm0, %%mm4                \n\t"
                        "addl $8, %1                        \n\t"
                        "addl $4, %%ecx                        \n\t"
                        "cmpl %4, %%ecx                        \n\t"
                        " jb 2b                                \n\t"
                        "addl %6, %1                        \n\t"
                        "psrad $8, %%mm4                \n\t"
                        "psrad $8, %%mm5                \n\t"
                        "packssdw %%mm5, %%mm4                \n\t"
                        "pmaddwd %%mm6, %%mm4                \n\t"
                        "packssdw %%mm4, %%mm4                \n\t"
                        "movl %3, %%eax                        \n\t"
                        "movd %%mm4, (%%eax, %0)        \n\t"
                        "addl $4, %0                        \n\t"
                        " jnc 1b                        \n\t"

                        : "+r" (counter), "+r" (filter)
                        : "m" (filterPos), "m" (dst), "m"(src+filterSize),
                          "m" (src), "r" (filterSize*2)
                        : "%ebx", "%eax", "%ecx"
                );
        }
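
        // Note: in all three MMX variants above the loop counter starts at -2*dstW and
        // counts up towards zero (" jnc 1b" loops until the addition carries), which is
        // why filter, filterPos and dst are adjusted by the (negative) counter before
        // each asm block.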
#else
        int i;
        for(i=0; i<dstW; i++)
        {
                int j;
                int srcPos= filterPos[i];
                int val=0;
//                printf("filterPos: %d\n", filterPos[i]);
                for(j=0; j<filterSize; j++)
                {
//                        printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
                }
//                filter += hFilterSize;
                dst[i] = MIN(MAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
//                dst[i] = val>>7;
        }
#endif
}
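
/* For reference, each output sample of hScale() is, in scalar form (this is exactly
   what the #else branch above computes):

       val    = sum over j of src[filterPos[i] + j] * filter[i*filterSize + j];
       dst[i] = MIN(MAX(0, val>>7), (1<<15)-1);

   The MMX paths arrive at the same >>7 scaling in two steps: "psrad $8" followed by a
   pmaddwd against the w02 constant (presumably a vector of 16-bit 2s). */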
      // *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(uint16_t *dst, int dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer)
{
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
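
    /* At this point src always points to a plain 8-bit luma line: packed YUV and
       RGB/BGR sources were converted into formatConvBuffer above, so the scalers
       below do not need to know anything about the original pixel format. */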

#ifdef HAVE_MMX
        // use the new MMX scaler if the MMX2 one can't be used (it's faster than the plain x86 asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
            RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
                        "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "movq %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t"
                        "paddw %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t"
                        "paddw %%mm6, %%mm2                \n\t"
                        "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFF
                        "movq %%mm2, %%mm4                \n\t"
                        "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "punpcklwd %%mm6, %%mm6                \n\t"
                        "xorl %%eax, %%eax                \n\t" // i
                        "movl %0, %%esi                        \n\t" // src
                        "movl %1, %%edi                        \n\t" // buf1
                        "movl %3, %%edx                        \n\t" // (xInc*4)>>16
                        "xorl %%ecx, %%ecx                \n\t"
                        "xorl %%ebx, %%ebx                \n\t"
                        "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF

#define FUNNY_Y_CODE \
                        PREFETCH" 1024(%%esi)                \n\t"\
                        PREFETCH" 1056(%%esi)                \n\t"\
                        PREFETCH" 1088(%%esi)                \n\t"\
                        "call *%6                        \n\t"\
                        "movq %%mm4, %%mm2                \n\t"\
                        "xorl %%ecx, %%ecx                \n\t"

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                        :: "m" (src), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                        "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (funnyYCode)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
                );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
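
        /* The run-time generated funnyYCode block is called 8 times, each call scaling
           another chunk of the output line with MMX2; the small loop right above then
           pads any output pixels whose source position would fall at or beyond srcW-1
           with the last input pixel, prescaled by 128 (<<7) to match the 15-bit
           intermediate format used by the vertical scaler. */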
        else
        {
#endif
        // no MMX2: plain x86 asm bilinear scaler ...
        asm volatile(
                "xorl %%eax, %%eax                \n\t" // i
                "xorl %%ebx, %%ebx                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                ".balign 16                        \n\t"
                "1:                                \n\t"
                "movzbl  (%0, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%edi, %%eax, 2)        \n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>8 + carry

                "movzbl (%0, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 2(%%edi, %%eax, 2)        \n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>8 + carry

                "addl $2, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"

                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
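
        /* The asm above is a 16.16 fixed-point bilinear scaler: the integer part of
           the source position lives in %%ebx and the fractional part in %%ecx, both
           advanced with the addw/adcl pair; each output sample is effectively

               dst[i] = (src[xx]<<7) + (src[xx+1] - src[xx]) * ((xpos&0xFFFF)>>9);

           i.e. the same formula as the portable C loop in the #else branch below. */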
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
                xpos+=xInc;
        }
#endif
    }
}

inline static void RENAME(hcscale)(uint16_t *dst, int dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                   int srcFormat, uint8_t *formatConvBuffer)
{
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
            return;
    }
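
    /* Chroma is handled as two half-width planes packed into one destination buffer:
       U samples go to dst[0..2047] and V samples to dst[2048..] (the asm below writes
       the second half at byte offset 4096, i.e. 2048 uint16_t entries). */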

#ifdef HAVE_MMX
        // use the new MMX scaler if the MMX2 one can't be used (it's faster than the plain x86 asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
            RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
            RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#ifdef ARCH_X86
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                asm volatile(
                "pxor %%mm7, %%mm7                \n\t"
                "pxor %%mm2, %%mm2                \n\t" // 2*xalpha
                "movd %5, %%mm6                        \n\t" // xInc&0xFFFF
                "punpcklwd %%mm6, %%mm6                \n\t"
                "punpcklwd %%mm6, %%mm6                \n\t"
                "movq %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t"
                "paddw %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t"
                "paddw %%mm6, %%mm2                \n\t"
                "psllq $16, %%mm2                \n\t" //0,t,2t,3t                t=xInc&0xFFFF
                "movq %%mm2, %%mm4                \n\t"
                "movd %4, %%mm6                        \n\t" //(xInc*4)&0xFFFF
                "punpcklwd %%mm6, %%mm6                \n\t"
                "punpcklwd %%mm6, %%mm6                \n\t"
                "xorl %%eax, %%eax                \n\t" // i
                "movl %0, %%esi                        \n\t" // src
                "movl %1, %%edi                        \n\t" // buf1
                "movl %3, %%edx                        \n\t" // (xInc*4)>>16
                "xorl %%ecx, %%ecx                \n\t"
                "xorl %%ebx, %%ebx                \n\t"
                "movw %4, %%bx                        \n\t" // (xInc*4)&0xFFFF

#define FUNNYUVCODE \
                        PREFETCH" 1024(%%esi)                \n\t"\
                        PREFETCH" 1056(%%esi)                \n\t"\
                        PREFETCH" 1088(%%esi)                \n\t"\
                        "call *%7                        \n\t"\
                        "movq %%mm4, %%mm2        \n\t"\
                        "xorl %%ecx, %%ecx                \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
                "xorl %%eax, %%eax                \n\t" // i
                "movl %6, %%esi                        \n\t" // src
                "movl %1, %%edi                        \n\t" // buf1
                "addl $4096, %%edi                \n\t"

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE
FUNNYUVCODE

                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" ((xInc*4)>>16),
                  "m" ((xInc*4)&0xFFFF), "m" (xInc&0xFFFF), "m" (src2), "m" (funnyUVCode)
                : "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi"
        );
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
                {
//                        printf("%d %d %d\n", dstWidth, i, srcW);
                        dst[i] = src1[srcW-1]*128;
                        dst[i+2048] = src2[srcW-1]*128;
                }
        }
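
        /* funnyUVCode is invoked twice: first over src1 (U), then over src2 (V) with
           the destination advanced by 4096 bytes; the edge loop above pads both halves
           with the last source pixel * 128, as in hyscale(). */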
        else
        {
#endif
        asm volatile(
                "xorl %%eax, %%eax                \n\t" // i
                "xorl %%ebx, %%ebx                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                ".balign 16                        \n\t"
                "1:                                \n\t"
                "movl %0, %%esi                        \n\t"
                "movzbl  (%%esi, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%%esi, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%edi, %%eax, 2)        \n\t"

                "movzbl  (%5, %%ebx), %%edi        \n\t" //src[xx]
                "movzbl 1(%5, %%ebx), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "movl %1, %%edi                        \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 4096(%%edi, %%eax, 2)\n\t"

                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFF
                "adcl %3, %%ebx                        \n\t" //xx+= xInc>>8 + carry
                "addl $1, %%eax                        \n\t"
                "cmpl %2, %%eax                        \n\t"
                " jb 1b                                \n\t"

                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc>>16), "m" (xInc&0xFFFF),
                "r" (src2)
                : "%eax", "%ebx", "%ecx", "%edi", "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
                xpos+=xInc;
        }
#endif
    }
}

static void RENAME(swScale)(SwsContext *c, uint8_t* srcParam[], int srcStrideParam[], int srcSliceY,
             int srcSliceH, uint8_t* dstParam[], int dstStrideParam[]){

        /* load a few things into local vars to make the code more readable and faster */
        const int srcW= c->srcW;
        const int dstW= c->dstW;
        const int dstH= c->dstH;
        const int chrDstW= c->chrDstW;
        const int lumXInc= c->lumXInc;
        const int chrXInc= c->chrXInc;
        const int dstFormat= c->dstFormat;
        const int flags= c->flags;
        const int canMMX2BeUsed= c->canMMX2BeUsed;
        int16_t *vLumFilterPos= c->vLumFilterPos;
        int16_t *vChrFilterPos= c->vChrFilterPos;
        int16_t *hLumFilterPos= c->hLumFilterPos;
        int16_t *hChrFilterPos= c->hChrFilterPos;
        int16_t *vLumFilter= c->vLumFilter;
        int16_t *vChrFilter= c->vChrFilter;
        int16_t *hLumFilter= c->hLumFilter;
        int16_t *hChrFilter= c->hChrFilter;
        int16_t *lumMmxFilter= c->lumMmxFilter;
        int16_t *chrMmxFilter= c->chrMmxFilter;
        const int vLumFilterSize= c->vLumFilterSize;
        const int vChrFilterSize= c->vChrFilterSize;
        const int hLumFilterSize= c->hLumFilterSize;
        const int hChrFilterSize= c->hChrFilterSize;
        int16_t **lumPixBuf= c->lumPixBuf;
        int16_t **chrPixBuf= c->chrPixBuf;
        const int vLumBufSize= c->vLumBufSize;
        const int vChrBufSize= c->vChrBufSize;
        uint8_t *funnyYCode= c->funnyYCode;
        uint8_t *funnyUVCode= c->funnyUVCode;
        uint8_t *formatConvBuffer= c->formatConvBuffer;

        /* vars which will change and which we need to store back in the context */
        int dstY= c->dstY;
        int lumBufIndex= c->lumBufIndex;
        int chrBufIndex= c->chrBufIndex;
        int lastInLumBuf= c->lastInLumBuf;
        int lastInChrBuf= c->lastInChrBuf;
        int srcStride[3];
        int dstStride[3];
        uint8_t *src[3];
        uint8_t *dst[3];

        if(c->srcFormat == IMGFMT_I420){
                src[0]= srcParam[0];
                src[1]= srcParam[2];
                src[2]= srcParam[1];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[2];
                srcStride[2]= srcStrideParam[1];
        }
        else if(c->srcFormat==IMGFMT_YV12){
                src[0]= srcParam[0];
                src[1]= srcParam[1];
                src[2]= srcParam[2];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]= srcStrideParam[1];
                srcStride[2]= srcStrideParam[2];
        }
        else if(isPacked(c->srcFormat)){
                src[0]=
                src[1]=
                src[2]= srcParam[0];
                srcStride[0]= srcStrideParam[0];
                srcStride[1]=
                srcStride[2]= srcStrideParam[0]<<1;
        }
        else if(isGray(c->srcFormat)){
                src[0]= srcParam[0];
                src[1]=
                src[2]= NULL;
                srcStride[0]= srcStrideParam[0];
                srcStride[1]=
                srcStride[2]= 0;
        }
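
        /* IMGFMT_I420 and IMGFMT_YV12 differ only in the order of the U and V planes,
           so the swap above lets the rest of swScale treat both layouts as YV12. */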

        if(dstFormat == IMGFMT_I420){
                dst[0]= dstParam[0];
                dst[1]= dstParam[2];
                dst[2]= dstParam[1];
                dstStride[0]= dstStrideParam[0];
                dstStride[1]= dstStrideParam[2];
                dstStride[2]= dstStrideParam[1];
        }else{
                dst[0]= dstParam[0];
                dst[1]= dstParam[1];
                dst[2]= dstParam[2];
                dstStride[0]= dstStrideParam[0];
                dstStride[1]= dstStrideParam[1];
                dstStride[2]= dstStrideParam[2];
        }

//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
//dstStride[0],dstStride[1],dstStride[2]);

        if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
        {
                static int firstTime=1; //FIXME move this into the context perhaps
                if(flags & SWS_PRINT_INFO && firstTime)
                {
                        fprintf(stderr, "SwScaler: Warning: dstStride is not aligned!\n"
                                        "SwScaler:          ->cannot do aligned memory accesses anymore\n");
                        firstTime=0;
                }
        }

        /* Note: the user might start scaling in the middle of the picture, so this
           will not get executed. That is not really intended, but it currently works,
           so people might do it. */
        if(srcSliceY ==0){
                lumBufIndex=0;
                chrBufIndex=0;
                dstY=0;
                lastInLumBuf= -1;
                lastInChrBuf= -1;
        }

        for(;dstY < dstH; dstY++){
                unsigned char *dest =dst[0]+dstStride[0]*dstY;
                unsigned char *uDest=dst[1]+dstStride[1]*(dstY>>1);
                unsigned char *vDest=dst[2]+dstStride[2]*(dstY>>1);
                const int chrDstY= isHalfChrV(dstFormat) ? (dstY>>1) : dstY;

                const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
                const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
                const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
                const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

                //handle holes (FAST_BILINEAR & weird filters)
                if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
                if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
                ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
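
                /* lumPixBuf/chrPixBuf are ring buffers of pointers to the horizontally
                   scaled lines; judging by the ASSERTs in this loop they hold
                   2*vLumBufSize / 2*vChrBufSize entries, presumably so that a vertical
                   filter window starting anywhere in the ring is always contiguous. */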

                // Do we have enough lines in this slice to output the dstY line
                if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < ((srcSliceY + srcSliceH + 1)>>1))
                {
                        //Do horizontal scaling
                        while(lastInLumBuf < lastLumSrcY)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
//                                printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
//                                printf("%d %d\n", lumBufIndex, vLumBufSize);
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf < lastChrSrcY)
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1))
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
                                //FIXME replace parameters through context struct (some at least)
                                RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                }
                else // not enough lines left in this slice -> load the rest into the buffer
                {
/*                printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
                        firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
                        lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
                        vChrBufSize, vLumBufSize);
*/
                        //Do horizontal scaling
                        while(lastInLumBuf+1 < srcSliceY + srcSliceH)
                        {
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                                lumBufIndex++;
                                ASSERT(lumBufIndex < 2*vLumBufSize)
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                                funnyYCode, c->srcFormat, formatConvBuffer);
                                lastInLumBuf++;
                        }
                        while(lastInChrBuf+1 < ((srcSliceY + srcSliceH)>>1))
                        {
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[1];
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - (srcSliceY>>1))*srcStride[2];
                                chrBufIndex++;
                                ASSERT(chrBufIndex < 2*vChrBufSize)
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) < ((srcSliceH+1)>>1))
                                ASSERT(lastInChrBuf + 1 - (srcSliceY>>1) >= 0)
                                RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, (srcW+1)>>1, chrXInc,
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                                funnyUVCode, c->srcFormat, formatConvBuffer);
                                lastInChrBuf++;
                        }
                        //wrap buf index around to stay inside the ring buffer
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
                        break; //we can't output a dstY line, so let's try again with the next slice
                }

#ifdef HAVE_MMX
                b5Dither= dither8[dstY&1];
                g6Dither= dither4[dstY&1];
                g5Dither= dither8[dstY&1];
                r5Dither= dither8[(dstY+1)&1];
#endif
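                /* The dither constants selected above feed the 15/16-bit RGB output
                   code; the pattern simply alternates with the parity of dstY (offset
                   by one line for red), presumably enough to break up banding. */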
            if(dstY < dstH-2)
            {
                if(isPlanarYUV(dstFormat)) //YV12 like
                {
                        if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                        {
                                int16_t *lumBuf = lumPixBuf[0];
                                int16_t *chrBuf= chrPixBuf[0];
                                RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW);
                        }
                        else //General YV12
                        {
                                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                                RENAME(yuv2yuvX)(
                                        vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
                                        vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, uDest, vDest, dstW,
                                        lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+(dstY>>1)*vChrFilterSize*4);
                        }
                }
                else
                {
                        int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                        int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;

                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                        {
                                int chrAlpha= vChrFilter[2*dstY+1];

                                RENAME(yuv2rgb1)(*lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, chrAlpha, dstFormat, flags);
                        }
                        else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                        {
                                int lumAlpha= vLumFilter[2*dstY+1];
                                int chrAlpha= vChrFilter[2*dstY+1];

                                RENAME(yuv2rgb2)(*lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                                 dest, dstW, lumAlpha, chrAlpha, dstFormat, flags);
                        }
                        else //General RGB
                        {
                                RENAME(yuv2rgbX)(
                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                        dest, dstW, dstFormat,
                                        lumMmxFilter+dstY*vLumFilterSize*4, chrMmxFilter+dstY*vChrFilterSize*4);
                        }
                }
            }
            else // looks like we can't use MMX here without overwriting this array's tail
            {
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
                if(isPlanarYUV(dstFormat)) //YV12
                {
                        if(dstY&1) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                        yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize     , lumSrcPtr, vLumFilterSize,
                                vChrFilter+(dstY>>1)*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, uDest, vDest, dstW);
                }
                else
                {
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                        yuv2rgbXinC(
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                dest, dstW, dstFormat);
                }
            }
        }
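
        /* Note: the last two output lines (dstY >= dstH-2) go through the plain C
           output functions above; the MMX versions store whole quadwords and could
           write past the end of the destination arrays on those lines. */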

#ifdef HAVE_MMX
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
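        /* sfence makes the non-temporal MOVNTQ stores globally visible before we
           return, and emms (femms on 3DNow! CPUs) clears the MMX state so the FPU
           can be used again. */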
        /* store changed local vars back in the context */
        c->dstY= dstY;
        c->lumBufIndex= lumBufIndex;
        c->chrBufIndex= chrBufIndex;
        c->lastInLumBuf= lastInLumBuf;
        c->lastInChrBuf= lastInChrBuf;
}