Statistics
| Branch: | Revision:

ffmpeg / libswscale / swscale_template.c @ e28630fc

History | View | Annotate | Download (105 KB)

1
/*
2
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation; either version 2 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU General Public License
17
 * along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19
 *
20
 * the C code (not assembly, mmx, ...) of this file can be used
21
 * under the LGPL license too
22
 */
23

    
24
/* Drop any earlier definitions of the helper macros that are (re)defined
 * right below, so the definitions that follow never clash with stale ones. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
31

    
32
/* EMMS expands to the mnemonic string "femms" when HAVE_3DNOW is defined
 * and to "emms" otherwise. */
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif
38

    
39
/* PREFETCH / PREFETCHW: prefetch mnemonic strings selected by feature macro.
 *   HAVE_3DNOW : "prefetch"    / "prefetchw"
 *   HAVE_MMX2  : "prefetchnta" / "prefetcht0"
 *   otherwise  : both expand to the string " # nop". */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
49

    
50
/* SFENCE expands to "sfence" when HAVE_MMX2 is defined and to the string
 * " # nop" otherwise. */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif
55

    
56
/* PAVGB(a,b) builds one line of asm text, "<mnemonic> a, b \n\t", using
 * "pavgb" with HAVE_MMX2 or "pavgusb" with HAVE_3DNOW. Both arguments are
 * stringified as written. The macro is deliberately left undefined when
 * neither feature macro is set. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
61

    
62
/* MOVNTQ(a,b) builds one line of asm text that stores a to b: "movntq" when
 * HAVE_MMX2 is defined, plain "movq" otherwise.
 * The extra REAL_MOVNTQ level exists so that the arguments of MOVNTQ are
 * macro-expanded before REAL_MOVNTQ stringifies them with #a / #b. */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
68

    
69
/* Pull in the AltiVec template only when HAVE_ALTIVEC is defined. */
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
72

    
73
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Complete asm statement that filters several source lines into one line of
 * 8-bit output, 8 output bytes per outer iteration.
 *
 *   x      - byte displacement added to every source address (stringified).
 *   offset - string literal: displacement from operand %0 to the filter list.
 *   dest   - operand %1, base address of the output bytes.
 *   width  - operand %2, compared against the running index.
 *
 * Operand %0 is &c->redDither, so a variable named c must be in scope.
 * The filter list is walked in 16-byte steps: a source pointer at +0 and the
 * coefficient quadword at +8. A zero source pointer terminates the list.
 * For each entry the two source quadwords are multiplied (pmulhw) by the
 * coefficient and added onto accumulators preloaded from
 * VROUNDER_OFFSET(%0). The sums are then shifted right by 3, packed to bytes
 * with unsigned saturation and stored through MOVNTQ.
 * Label 1 serves both loops: "jnz 1b" continues the filter list, "jb 1b"
 * starts the next group of 8 outputs after the accumulators and list pointer
 * have been re-initialised.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
                asm volatile(\
                        "xor %%"REG_a", %%"REG_a"        \n\t" /* index = 0 */\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t" /* accumulators start at the rounder */\
                        "movq %%mm3, %%mm4                \n\t"\
                        "lea " offset "(%0), %%"REG_d"        \n\t" /* first filter entry */\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t" /* its source pointer */\
                        ASMALIGN(4) /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                        "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
                        "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
                        "add $16, %%"REG_d"                \n\t" /* next filter entry */\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        "test %%"REG_S", %%"REG_S"        \n\t" /* zero pointer ends the list */\
                        "pmulhw %%mm0, %%mm2                \n\t"\
                        "pmulhw %%mm0, %%mm5                \n\t"\
                        "paddw %%mm2, %%mm3                \n\t"\
                        "paddw %%mm5, %%mm4                \n\t"\
                        " jnz 1b                        \n\t"\
                        "psraw $3, %%mm3                \n\t"\
                        "psraw $3, %%mm4                \n\t"\
                        "packuswb %%mm4, %%mm3                \n\t"\
                        MOVNTQ(%%mm3, (%1, %%REGa))\
                        "add $8, %%"REG_a"                \n\t"\
                        "cmp %2, %%"REG_a"                \n\t" /* flags survive until the jb below */\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                        "movq %%mm3, %%mm4                \n\t"\
                        "lea " offset "(%0), %%"REG_d"        \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        "jb 1b                                \n\t"\
                        :: "r" (&c->redDither),\
                        "r" (dest), "g" (width)\
                        : "%"REG_a, "%"REG_d, "%"REG_S\
                );
108

    
109
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Same interface as YSCALEYUV2YV12X (operand %0 = &c->redDither, %1 = dest,
 * %2 = width), but the products are accumulated as 32-bit sums.
 *
 * Each 16-byte filter entry is read as: source pointer at +0, a second
 * source pointer at +4 and a coefficient quadword at +8. The words of the
 * two source lines are interleaved (punpcklwd/punpckhwd) and fed to pmaddwd,
 * so one entry contributes two source lines. The pointer at +16 (the next
 * entry) being zero ends the list. Afterwards the sums are shifted right by
 * 16, packed to words, the rounder from VROUNDER_OFFSET(%0) is added, the
 * result is shifted right by 3, packed to bytes and stored through MOVNTQ.
 *
 * NOTE(review): the second pointer is fetched from byte offset 4, which
 * leaves only 4 bytes for the first pointer -- confirm the entry layout on
 * targets with 8-byte pointers.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
                asm volatile(\
                        "lea " offset "(%0), %%"REG_d"        \n\t" /* first filter entry */\
                        "xor %%"REG_a", %%"REG_a"        \n\t" /* index = 0 */\
                        "pxor %%mm4, %%mm4              \n\t" /* clear the four 32-bit accumulators */\
                        "pxor %%mm5, %%mm5              \n\t"\
                        "pxor %%mm6, %%mm6              \n\t"\
                        "pxor %%mm7, %%mm7              \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        ASMALIGN(4) \
                        "1:                                \n\t"\
                        "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\
                        "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
                        "mov 4(%%"REG_d"), %%"REG_S"        \n\t" /* second source pointer of this entry */\
                        "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\
                        "movq %%mm0, %%mm3              \n\t"\
                        "punpcklwd %%mm1, %%mm0        \n\t"\
                        "punpckhwd %%mm1, %%mm3        \n\t"\
                        "movq 8(%%"REG_d"), %%mm1        \n\t" /* filterCoeff */\
                        "pmaddwd %%mm1, %%mm0           \n\t"\
                        "pmaddwd %%mm1, %%mm3           \n\t"\
                        "paddd %%mm0, %%mm4             \n\t"\
                        "paddd %%mm3, %%mm5             \n\t"\
                        "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\
                        "mov 16(%%"REG_d"), %%"REG_S"        \n\t" /* source pointer of the next entry */\
                        "add $16, %%"REG_d"                \n\t"\
                        "test %%"REG_S", %%"REG_S"      \n\t" /* zero pointer ends the list */\
                        "movq %%mm2, %%mm0              \n\t"\
                        "punpcklwd %%mm3, %%mm2        \n\t"\
                        "punpckhwd %%mm3, %%mm0        \n\t"\
                        "pmaddwd %%mm1, %%mm2           \n\t"\
                        "pmaddwd %%mm1, %%mm0           \n\t"\
                        "paddd %%mm2, %%mm6             \n\t"\
                        "paddd %%mm0, %%mm7             \n\t"\
                        " jnz 1b                        \n\t"\
                        "psrad $16, %%mm4                \n\t"\
                        "psrad $16, %%mm5                \n\t"\
                        "psrad $16, %%mm6                \n\t"\
                        "psrad $16, %%mm7                \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
                        "packssdw %%mm5, %%mm4                \n\t"\
                        "packssdw %%mm7, %%mm6                \n\t"\
                        "paddw %%mm0, %%mm4             \n\t"\
                        "paddw %%mm0, %%mm6             \n\t"\
                        "psraw $3, %%mm4                \n\t"\
                        "psraw $3, %%mm6                \n\t"\
                        "packuswb %%mm6, %%mm4                \n\t"\
                        MOVNTQ(%%mm4, (%1, %%REGa))\
                        "add $8, %%"REG_a"                \n\t"\
                        "cmp %2, %%"REG_a"                \n\t" /* flags survive until the jb below */\
                        "lea " offset "(%0), %%"REG_d"        \n\t"\
                        "pxor %%mm4, %%mm4              \n\t"\
                        "pxor %%mm5, %%mm5              \n\t"\
                        "pxor %%mm6, %%mm6              \n\t"\
                        "pxor %%mm7, %%mm7              \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        "jb 1b                                \n\t"\
                        :: "r" (&c->redDither),\
                        "r" (dest), "g" (width)\
                        : "%"REG_a, "%"REG_d, "%"REG_S\
                );
170

    
171
/*
 * YSCALEYUV2YV121
 *
 * Asm text fragment (no asm statement, no operand list) that converts 16-bit
 * samples to bytes: two quadwords are read from (%0 + 2*index), shifted right
 * by 7, packed with unsigned saturation and stored at (%1 + index).
 * The index starts at operand %2 and is advanced by 8; the loop repeats while
 * the add does not carry. Presumably %2 is a negative count and %0/%1 point
 * at the ends of the buffers -- confirm at the use site.
 */
#define YSCALEYUV2YV121 \
                        "mov %2, %%"REG_a"                \n\t"\
                        ASMALIGN(4) /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movq (%0, %%"REG_a", 2), %%mm0        \n\t"\
                        "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
                        "psraw $7, %%mm0                \n\t"\
                        "psraw $7, %%mm1                \n\t"\
                        "packuswb %%mm1, %%mm0                \n\t"\
                        MOVNTQ(%%mm0, (%1, %%REGa))\
                        "add $8, %%"REG_a"                \n\t"\
                        "jnc 1b                                \n\t"
183

    
184
/*
185
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
186
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
187
                           "r" (dest), "m" (dstW),
188
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
189
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
190
*/
191
/*
 * YSCALEYUV2PACKEDX
 *
 * Opens an asm statement ("asm volatile(") and emits the vertical filter for
 * packed output; the statement is left open and still needs its operand
 * lists and closing parenthesis (see YSCALEYUV2PACKEDX_END).
 *
 * Label 1 marks the start of one output group; the back-edge to it is not
 * part of this macro. Inside, two loops on label 2 walk 16-byte filter
 * entries (source pointer at +0, coefficient quadword at +8, zero pointer
 * terminates):
 *   - chroma list at CHR_MMX_FILTER_OFFSET(%0): U is read at the source
 *     pointer, V 4096 bytes behind it; results accumulate in mm3 (U), mm4 (V)
 *   - luma list at LUM_MMX_FILTER_OFFSET(%0): results accumulate in
 *     mm1 (Y1) and mm7 (Y2)
 * All accumulators start from the value at VROUNDER_OFFSET(%0).
 */
#define YSCALEYUV2PACKEDX \
        asm volatile(\
                "xor %%"REG_a", %%"REG_a"        \n\t"\
                ASMALIGN(4)\
                "nop                                \n\t"\
                "1:                                \n\t"\
                "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                "movq %%mm3, %%mm4                \n\t"\
                ASMALIGN(4)\
                "2:                                \n\t"\
                "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a"), %%mm2        \n\t" /* UsrcData */\
                "movq 4096(%%"REG_S", %%"REG_a"), %%mm5        \n\t" /* VsrcData */\
                "add $16, %%"REG_d"                \n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm3                \n\t"\
                "paddw %%mm5, %%mm4                \n\t"\
                "test %%"REG_S", %%"REG_S"        \n\t"\
                " jnz 2b                        \n\t"\
\
                "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
                "movq %%mm1, %%mm7                \n\t"\
                ASMALIGN(4)\
                "2:                                \n\t"\
                "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a", 2), %%mm2        \n\t" /* Y1srcData */\
                "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5        \n\t" /* Y2srcData */\
                "add $16, %%"REG_d"                \n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm1                \n\t"\
                "paddw %%mm5, %%mm7                \n\t"\
                "test %%"REG_S", %%"REG_S"        \n\t"\
                " jnz 2b                        \n\t"
232

    
233
/*
 * YSCALEYUV2PACKEDX_END
 *
 * Operand lists, clobber list and closing ");" for an asm statement that was
 * opened elsewhere (the YSCALEYUV2PACKEDX* macros open one without closing
 * it). No outputs are declared. Inputs:
 *   %0 = &c->redDither (register), %1..%3 = dummy (memory),
 *   %4 = dest (register),          %5 = dstW (memory)
 * so variables named c, dummy, dest and dstW must be in scope.
 */
#define YSCALEYUV2PACKEDX_END\
        :: "r" (&c->redDither), \
            "m" (dummy), "m" (dummy), "m" (dummy),\
            "r" (dest), "m" (dstW)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
        );
239

    
240
/*
 * YSCALEYUV2PACKEDX_ACCURATE
 *
 * Variant of YSCALEYUV2PACKEDX that accumulates 32-bit sums with pmaddwd.
 * It opens an asm statement and leaves it open; the operand lists and the
 * back-edge to label 1 are supplied by whatever follows it.
 *
 * Each 16-byte filter entry is read as: source pointer at +0, second source
 * pointer at +4, coefficient quadword at +8; the pointer at +16 being zero
 * ends the list.
 *   - chroma pass: sums are shifted right by 16, packed to words, the rounder
 *     from VROUNDER_OFFSET(%0) is added, and U/V are parked in
 *     U_TEMP(%0) / V_TEMP(%0)
 *   - luma pass: same arithmetic, leaving Y1 in mm1 and Y2 in mm7
 * Finally U and V are reloaded into mm3 and mm4.
 *
 * NOTE(review): as in YSCALEYUV2YV12X_ACCURATE, the second pointer is read
 * from byte offset 4 -- confirm the entry layout on 8-byte-pointer targets.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
        asm volatile(\
                "xor %%"REG_a", %%"REG_a"        \n\t"\
                ASMALIGN(4)\
                "nop                                \n\t"\
                "1:                                \n\t"\
                "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pxor %%mm4, %%mm4              \n\t"\
                "pxor %%mm5, %%mm5              \n\t"\
                "pxor %%mm6, %%mm6              \n\t"\
                "pxor %%mm7, %%mm7              \n\t"\
                ASMALIGN(4)\
                "2:                                \n\t"\
                "movq (%%"REG_S", %%"REG_a"), %%mm0        \n\t" /* UsrcData */\
                "movq 4096(%%"REG_S", %%"REG_a"), %%mm2        \n\t" /* VsrcData */\
                "mov 4(%%"REG_d"), %%"REG_S"        \n\t"\
                "movq (%%"REG_S", %%"REG_a"), %%mm1        \n\t" /* UsrcData */\
                "movq %%mm0, %%mm3              \n\t"\
                "punpcklwd %%mm1, %%mm0        \n\t"\
                "punpckhwd %%mm1, %%mm3        \n\t"\
                "movq 8(%%"REG_d"), %%mm1        \n\t" /* filterCoeff */\
                "pmaddwd %%mm1, %%mm0           \n\t"\
                "pmaddwd %%mm1, %%mm3           \n\t"\
                "paddd %%mm0, %%mm4             \n\t"\
                "paddd %%mm3, %%mm5             \n\t"\
                "movq 4096(%%"REG_S", %%"REG_a"), %%mm3        \n\t" /* VsrcData */\
                "mov 16(%%"REG_d"), %%"REG_S"        \n\t"\
                "add $16, %%"REG_d"                \n\t"\
                "test %%"REG_S", %%"REG_S"      \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "punpcklwd %%mm3, %%mm2        \n\t"\
                "punpckhwd %%mm3, %%mm0        \n\t"\
                "pmaddwd %%mm1, %%mm2           \n\t"\
                "pmaddwd %%mm1, %%mm0           \n\t"\
                "paddd %%mm2, %%mm6             \n\t"\
                "paddd %%mm0, %%mm7             \n\t"\
                " jnz 2b                        \n\t"\
                "psrad $16, %%mm4                \n\t"\
                "psrad $16, %%mm5                \n\t"\
                "psrad $16, %%mm6                \n\t"\
                "psrad $16, %%mm7                \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
                "packssdw %%mm5, %%mm4                \n\t"\
                "packssdw %%mm7, %%mm6                \n\t"\
                "paddw %%mm0, %%mm4             \n\t"\
                "paddw %%mm0, %%mm6             \n\t"\
                "movq %%mm4, "U_TEMP"(%0)       \n\t" /* park U */\
                "movq %%mm6, "V_TEMP"(%0)       \n\t" /* park V */\
\
                "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pxor %%mm1, %%mm1              \n\t"\
                "pxor %%mm5, %%mm5              \n\t"\
                "pxor %%mm7, %%mm7              \n\t"\
                "pxor %%mm6, %%mm6              \n\t"\
                ASMALIGN(4)\
                "2:                                \n\t"\
                "movq (%%"REG_S", %%"REG_a", 2), %%mm0        \n\t" /* Y1srcData */\
                "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2        \n\t" /* Y2srcData */\
                "mov 4(%%"REG_d"), %%"REG_S"        \n\t"\
                "movq (%%"REG_S", %%"REG_a", 2), %%mm4        \n\t" /* Y1srcData */\
                "movq %%mm0, %%mm3              \n\t"\
                "punpcklwd %%mm4, %%mm0        \n\t"\
                "punpckhwd %%mm4, %%mm3        \n\t"\
                "movq 8(%%"REG_d"), %%mm4        \n\t" /* filterCoeff */\
                "pmaddwd %%mm4, %%mm0           \n\t"\
                "pmaddwd %%mm4, %%mm3           \n\t"\
                "paddd %%mm0, %%mm1             \n\t"\
                "paddd %%mm3, %%mm5             \n\t"\
                "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3        \n\t" /* Y2srcData */\
                "mov 16(%%"REG_d"), %%"REG_S"        \n\t"\
                "add $16, %%"REG_d"                \n\t"\
                "test %%"REG_S", %%"REG_S"      \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "punpcklwd %%mm3, %%mm2        \n\t"\
                "punpckhwd %%mm3, %%mm0        \n\t"\
                "pmaddwd %%mm4, %%mm2           \n\t"\
                "pmaddwd %%mm4, %%mm0           \n\t"\
                "paddd %%mm2, %%mm7             \n\t"\
                "paddd %%mm0, %%mm6             \n\t"\
                " jnz 2b                        \n\t"\
                "psrad $16, %%mm1                \n\t"\
                "psrad $16, %%mm5                \n\t"\
                "psrad $16, %%mm7                \n\t"\
                "psrad $16, %%mm6                \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
                "packssdw %%mm5, %%mm1                \n\t"\
                "packssdw %%mm6, %%mm7                \n\t"\
                "paddw %%mm0, %%mm1             \n\t"\
                "paddw %%mm0, %%mm7             \n\t"\
                "movq  "U_TEMP"(%0), %%mm3      \n\t" /* reload U */\
                "movq  "V_TEMP"(%0), %%mm4      \n\t" /* reload V */
333

    
334
/*
 * YSCALEYUV2RGBX
 *
 * Asm text fragment that turns filtered Y/U/V words into packed B, G and R
 * bytes, using the offsets and coefficients stored relative to operand %0.
 *
 * On entry: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V.
 * On exit : mm2 = B, mm4 = G, mm5 = R (each packed to bytes with unsigned
 *           saturation), mm7 = 0; mm0, mm1, mm3 and mm6 are overwritten.
 * Each chroma word is duplicated (punpcklwd/punpckhwd with itself) so that
 * one chroma sample is applied to two neighbouring luma samples.
 */
#define YSCALEYUV2RGBX \
                "psubw "U_OFFSET"(%0), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"(%0), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"(%0), %%mm3        \n\t"\
                "pmulhw "VG_COEFF"(%0), %%mm4        \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "pmulhw "UB_COEFF"(%0), %%mm2        \n\t"\
                "pmulhw "VR_COEFF"(%0), %%mm5        \n\t"\
                "psubw "Y_OFFSET"(%0), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"(%0), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"(%0), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"(%0), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t" /* ug + vg */\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
370
/* Disabled: FULL_YSCALEYUV2RGB is compiled out by the "#if 0" below and is
 * kept for reference only. It blends two luma and two chroma lines with the
 * weights in operands %6 / %7 and converts to B (mm3), G (mm1) and R (mm0)
 * using the MANGLE()d constant tables. */
#if 0
#define FULL_YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7                \n\t"\
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "xor %%"REG_a", %%"REG_a"                \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%0, %%"REG_a", 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%"REG_a", 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq (%2, %%"REG_a",2), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%"REG_a",2), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm6, %%mm0                \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw %%mm5, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "movq 4096(%2, %%"REG_a",2), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "movq 4096(%3, %%"REG_a",2), %%mm0        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "psubw %%mm0, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* 8(U-128)*/\
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
\
\
                "pmulhw %%mm5, %%mm4                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
                "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
                "paddw %%mm4, %%mm0                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "MANGLE(w400)", %%mm0        \n\t" /* (V-128)8*/\
\
\
                "movq %%mm0, %%mm4                \n\t" /* (V-128)8*/\
                "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
                "paddw %%mm1, %%mm3                \n\t" /* B*/\
                "paddw %%mm1, %%mm0                \n\t" /* R*/\
                "packuswb %%mm3, %%mm3                \n\t"\
\
                "packuswb %%mm0, %%mm0                \n\t"\
                "paddw %%mm4, %%mm2                \n\t"\
                "paddw %%mm2, %%mm1                \n\t" /* G*/\
\
                "packuswb %%mm1, %%mm1                \n\t"
#endif
424

    
425
/*
 * REAL_YSCALEYUV2PACKED(index, c) / YSCALEYUV2PACKED(index, c)
 *
 * Asm text fragment that blends two lines of luma and two lines of chroma
 * for packed output.
 *
 *   index - register used as the loop index (stringified into the text)
 *   c     - base operand for the *_MMX_FILTER_OFFSET displacements
 *
 * Operand names used in the comments (buf0 = %0, buf1 = %1, uvbuf0 = %2,
 * uvbuf1 = %3) follow the original annotations; the actual bindings are
 * supplied by the asm statement that uses this macro.
 *
 * NOTE: before the loop, the quadwords at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are loaded, shifted right by 3 and written
 * back, i.e. the stored coefficients are modified every time this code runs.
 *
 * Result per iteration: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2, each computed
 * as (line1 >> 7) + pmulhw(line0 - line1, coefficient).
 * The loop label "1:" is opened here; the back-edge is not part of the macro.
 * YSCALEYUV2PACKED exists so that its arguments are macro-expanded before
 * REAL_YSCALEYUV2PACKED stringifies them.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
                "psraw $3, %%mm0                \n\t"\
                "psraw $3, %%mm1                \n\t"\
                "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "xor "#index", "#index"                \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $7, %%mm3                \n\t" /* uvbuf1[eax] >>7*/\
                "psraw $7, %%mm4                \n\t" /* uvbuf1[eax+2048] >>7*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "movq (%0, "#index", 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, "#index", 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, "#index", 2), %%mm6        \n\t" /*buf0[eax], next quadword*/\
                "movq 8(%1, "#index", 2), %%mm7        \n\t" /*buf1[eax], next quadword*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $7, %%mm1                \n\t" /* buf1[eax] >>7*/\
                "psraw $7, %%mm7                \n\t" /* buf1[eax] >>7, next quadword*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
462
                
463
#define REAL_YSCALEYUV2RGB(index, c) \
464
                "xor "#index", "#index"        \n\t"\
465
                ASMALIGN(4)\
466
                "1:                                \n\t"\
467
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
468
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
469
                "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
470
                "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
471
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
472
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
473
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
474
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
475
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
476
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
477
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
478
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
479
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
480
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
481
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
482
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
483
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
484
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
485
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
486
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
487
                "movq (%0, "#index", 2), %%mm0        \n\t" /*buf0[eax]*/\
488
                "movq (%1, "#index", 2), %%mm1        \n\t" /*buf1[eax]*/\
489
                "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
490
                "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
491
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
492
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax] - buf1[eax]*/\
493
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
494
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
495
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
496
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
497
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
498
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
499
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
500
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
501
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
502
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
503
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
504
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
505
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
506
                "paddw %%mm3, %%mm4                \n\t"\
507
                "movq %%mm2, %%mm0                \n\t"\
508
                "movq %%mm5, %%mm6                \n\t"\
509
                "movq %%mm4, %%mm3                \n\t"\
510
                "punpcklwd %%mm2, %%mm2                \n\t"\
511
                "punpcklwd %%mm5, %%mm5                \n\t"\
512
                "punpcklwd %%mm4, %%mm4                \n\t"\
513
                "paddw %%mm1, %%mm2                \n\t"\
514
                "paddw %%mm1, %%mm5                \n\t"\
515
                "paddw %%mm1, %%mm4                \n\t"\
516
                "punpckhwd %%mm0, %%mm0                \n\t"\
517
                "punpckhwd %%mm6, %%mm6                \n\t"\
518
                "punpckhwd %%mm3, %%mm3                \n\t"\
519
                "paddw %%mm7, %%mm0                \n\t"\
520
                "paddw %%mm7, %%mm6                \n\t"\
521
                "paddw %%mm7, %%mm3                \n\t"\
522
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
523
                "packuswb %%mm0, %%mm2                \n\t"\
524
                "packuswb %%mm6, %%mm5                \n\t"\
525
                "packuswb %%mm3, %%mm4                \n\t"\
526
                "pxor %%mm7, %%mm7                \n\t"
527
/* Forwarding layer: the arguments are macro-expanded here, before
 * REAL_YSCALEYUV2RGB turns them into text with the # operator. */
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
528
                
529
/*
 * Loop head for packed output that reads one chroma buffer (asm operand %2)
 * and one luma buffer (asm operand %0) without blending a second line.
 *
 * Clears the index register, emits the local label "1:" and loads, each
 * arithmetically shifted right by 7 bits:
 *   mm3 <- 4 words at (%2 + index)
 *   mm4 <- 4 words at (%2 + index + 4096 bytes)
 *   mm1 <- 4 words at (%0 + 2*index)
 *   mm7 <- 4 words at (%0 + 2*index + 8 bytes)
 *
 * The parameter c is not referenced by this macro.
 * The loop is expected to be closed by the " jb 1b" of a WRITE* macro.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
                "xor "#index", "#index"                \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $7, %%mm3                \n\t" \
                "psraw $7, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1        \n\t" /* buf0[eax], first 4 words */\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /* buf0[eax], next 4 words */\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t"

/* Forwarding layer so that the arguments are macro-expanded before being stringified. */
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
543
                
544
/*
 * Loop head + YUV->RGB conversion reading one chroma buffer (asm operand %2)
 * and one luma buffer (asm operand %0); no second line is blended in.
 *
 * index : register used as loop counter (cleared here).
 * c     : base register for the U_OFFSET/V_OFFSET/Y_OFFSET and *_COEFF
 *         memory operands.
 *
 * Per iteration: 4 words at (%2+index) and 4 words at (%2+index+4096) are
 * shifted right by 4, have U_OFFSET/V_OFFSET subtracted and are multiplied
 * (pmulhw) by the UG/VG/UB/VR coefficients. 8 luma words from (%0+2*index)
 * are shifted right by 4, have Y_OFFSET subtracted and are multiplied by
 * Y_COEFF. Every chroma word is duplicated (punpck?wd) so that it is added to
 * two neighbouring luma words.
 *
 * On exit: mm2=B, mm4=G, mm5=R (8 unsigned-saturated bytes each), mm7=0.
 * The loop is expected to be closed by the " jb 1b" of a WRITE* macro.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
                "xor "#index", "#index"        \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] >>4*/\
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1        \n\t" /* buf0[eax], first 4 words */\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /* buf0[eax], next 4 words */\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t" /* mm4 = ug + vg */\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1: added to mm1, 2: added to mm7) */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

/* Forwarding layer so that the arguments are macro-expanded before being stringified. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
592

    
593
/*
 * Loop head for packed output that sums two chroma buffers (asm operands %2
 * and %3) but reads luma from a single buffer (asm operand %0).
 *
 * Clears the index register, emits the local label "1:" and leaves:
 *   mm3 <- (words at %2+index      + words at %3+index)      >> 8 (logical)
 *   mm4 <- (words at %2+index+4096 + words at %3+index+4096) >> 8 (logical)
 *   mm1 <- 4 words at (%0 + 2*index)     >> 7 (arithmetic)
 *   mm7 <- 4 words at (%0 + 2*index + 8) >> 7 (arithmetic)
 * mm2 and mm5 are used as scratch registers.
 *
 * The parameter c is not referenced by this macro.
 * The loop is expected to be closed by the " jb 1b" of a WRITE* macro.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
                "xor "#index", "#index"                \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $8, %%mm3                \n\t" \
                "psrlw $8, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1        \n\t" /* buf0[eax], first 4 words */\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /* buf0[eax], next 4 words */\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t"

/* Forwarding layer so that the arguments are macro-expanded before being stringified. */
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
610
                
611
/*
 * Loop head + YUV->RGB conversion with vertical chrominance interpolation:
 * chroma is the sum of two buffers (asm operands %2 and %3), luma comes from
 * a single buffer (asm operand %0).
 *
 * index : register used as loop counter (cleared here).
 * c     : base register for the U_OFFSET/V_OFFSET/Y_OFFSET and *_COEFF
 *         memory operands.
 *
 * The chroma sum is shifted right logically by 5 (the original author notes
 * that the preceding 16-bit addition might overflow). Everything after that
 * follows the same sequence as REAL_YSCALEYUV2RGB1.
 *
 * On exit: mm2=B, mm4=G, mm5=R (8 unsigned-saturated bytes each), mm7=0.
 * The loop is expected to be closed by the " jb 1b" of a WRITE* macro.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
                "xor "#index", "#index"                \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
                "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1        \n\t" /* buf0[eax], first 4 words */\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /* buf0[eax], next 4 words */\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t" /* mm4 = ug + vg */\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1: added to mm1, 2: added to mm7) */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"

/* Forwarding layer so that the arguments are macro-expanded before being stringified. */
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
664

    
665
/*
 * Loop tail: interleave 8 B, G and R bytes with a zero byte and store them as
 * 8 four-byte pixels, then advance and close the loop.
 *
 * Expects mm2=B, mm4=G, mm5=R, mm7=0. Clobbers mm0-mm3, mm5, mm6.
 *
 * dst   : base register of the destination; stores go to dst + 4*index.
 * dstw  : operand that index is compared against.
 * index : loop counter; incremented by 8, loop repeats ("jb 1b") while
 *         index is below dstw (unsigned comparison).
 */
#define REAL_WRITEBGR32(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        MOVNTQ(%%mm0, (dst, index, 4))\
                        MOVNTQ(%%mm2, 8(dst, index, 4))\
                        MOVNTQ(%%mm1, 16(dst, index, 4))\
                        MOVNTQ(%%mm3, 24(dst, index, 4))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"

/* Forwarding layer so that the arguments are macro-expanded before being stringified. */
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
689

    
690
/*
 * Loop tail: pack 8 B, G and R bytes into 8 sixteen-bit pixels and store
 * them, then advance and close the loop.
 *
 * Expects mm2=B, mm4=G, mm5=R, mm7=0. Clobbers mm1-mm5.
 * B and R are masked with bF8 and G with bFC (constants defined elsewhere;
 * presumably 0xF8 / 0xFC in every byte, which would give a 5-6-5 layout --
 * TODO confirm). B is shifted right by 3, B and R bytes are interleaved into
 * words, and the zero-extended G words are shifted left by 3 and OR-ed in.
 *
 * dst   : base register of the destination; stores go to dst + 2*index.
 * dstw  : operand that index is compared against.
 * index : loop counter; incremented by 8, loop repeats ("jb 1b") while
 *         index is below dstw (unsigned comparison).
 */
#define REAL_WRITEBGR16(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
                        "pand "MANGLE(bFC)", %%mm4        \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $3, %%mm3                \n\t"\
                        "psllq $3, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"

/* Forwarding layer so that the arguments are macro-expanded before being stringified. */
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
717

    
718
/*
 * Loop tail: pack 8 B, G and R bytes into 8 fifteen-bit pixels (stored as
 * 16-bit words) and store them, then advance and close the loop.
 *
 * Expects mm2=B, mm4=G, mm5=R, mm7=0. Clobbers mm1-mm5.
 * All three channels are masked with bF8 (constant defined elsewhere;
 * presumably 0xF8 in every byte, which would give a 5-5-5 layout -- TODO
 * confirm). B is shifted right by 3 and R right by 1, B and R bytes are
 * interleaved into words, and the zero-extended G words are shifted left by
 * 2 and OR-ed in.
 *
 * dst   : base register of the destination; stores go to dst + 2*index.
 * dstw  : operand that index is compared against.
 * index : loop counter; incremented by 8, loop repeats ("jb 1b") while
 *         index is below dstw (unsigned comparison).
 */
#define REAL_WRITEBGR15(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
                        "pand "MANGLE(bF8)", %%mm4        \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
                        "psrlq $1, %%mm5                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $2, %%mm3                \n\t"\
                        "psllq $2, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"

/* Forwarding layer so that the arguments are macro-expanded before being stringified. */
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
746

    
747
/*
 * Loop tail (older variant): build four 0RGB0RGB quadwords from 8 B, G and R
 * bytes, squeeze out the zero bytes with shifts and the bm00000111 /
 * bm11111000 / bm00001111 masks (defined elsewhere), and store the resulting
 * 24 bytes as three quadwords.
 *
 * Expects mm2=B, mm4=G, mm5=R, mm7=0. Clobbers mm0-mm6.
 *
 * dst   : register holding the current output address; stores go to dst,
 *         dst+8 and dst+16, then dst itself is advanced by 24.
 * dstw  : operand that index is compared against.
 * index : loop counter; incremented by 8, loop repeats ("jb 1b") while
 *         index is below dstw (unsigned comparison).
 */
#define WRITEBGR24OLD(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
                        "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4                \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4                \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2                \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2                \n\t" /* 000RRGBR 1 */\
                        "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4                \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
                        "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1                \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4                \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2                \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4                \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5                \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3                \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3                \n\t" /* RGBRGBRG 2.5 */\
\
                        MOVNTQ(%%mm0, (dst))\
                        MOVNTQ(%%mm2, 8(dst))\
                        MOVNTQ(%%mm3, 16(dst))\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
802

    
803
/*
 * Loop tail (plain MMX variant): build four 0RGB0RGB quadwords from 8 B, G
 * and R bytes, compact each one to six payload bytes with psllq/punpckhdq,
 * then splice the four results into three quadwords (24 bytes) and store
 * them.
 *
 * Expects mm2=B, mm4=G, mm5=R, mm7=0. Clobbers mm0-mm7 (mm7 is overwritten,
 * so it is no longer zero afterwards).
 *
 * dst   : register holding the current output address; stores go to dst,
 *         dst+8 and dst+16, then dst itself is advanced by 24.
 * dstw  : operand that index is compared against.
 * index : loop counter; incremented by 8, loop repeats ("jb 1b") while
 *         index is below dstw (unsigned comparison).
 */
#define WRITEBGR24MMX(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGB0RGB 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGB0RGB 2 */\
                        "movq %%mm3, %%mm7                \n\t" /* 0RGB0RGB 3 */\
\
                        "psllq $40, %%mm0                \n\t" /* RGB00000 0 */\
                        "psllq $40, %%mm2                \n\t" /* RGB00000 1 */\
                        "psllq $40, %%mm1                \n\t" /* RGB00000 2 */\
                        "psllq $40, %%mm3                \n\t" /* RGB00000 3 */\
\
                        "punpckhdq %%mm4, %%mm0                \n\t" /* 0RGBRGB0 0 */\
                        "punpckhdq %%mm6, %%mm2                \n\t" /* 0RGBRGB0 1 */\
                        "punpckhdq %%mm5, %%mm1                \n\t" /* 0RGBRGB0 2 */\
                        "punpckhdq %%mm7, %%mm3                \n\t" /* 0RGBRGB0 3 */\
\
                        "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGBRGB0 1 */\
                        "psllq $40, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
                        MOVNTQ(%%mm0, (dst))\
\
                        "psrlq $24, %%mm6                \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGBRGB0 2 */\
                        "psllq $24, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm6                \n\t" /* BRGBRGBR 1 */\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "psrlq $40, %%mm5                \n\t" /* 000000RG 2 */\
                        "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm3, %%mm5                \n\t" /* RGBRGBRG 2 */\
                        MOVNTQ(%%mm5, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                        \n\t"\
                        "cmp "#dstw", "#index"                        \n\t"\
                        " jb 1b                                \n\t"
855

    
856
/*
 * Loop tail (MMX2 variant, uses pshufw): for each of the three output
 * quadwords, replicate the needed B, G and R byte pairs with pshufw, keep the
 * wanted byte positions with the M24A / M24B / M24C masks (defined elsewhere)
 * and OR the three channels together. Stores 24 bytes.
 *
 * Expects mm2=B, mm4=G, mm5=R. Clobbers mm0, mm1, mm3, mm4 (shifted right by
 * one byte), mm6 and mm7 (loaded with M24C, so it is no longer zero).
 *
 * dst   : register holding the current output address; stores go to dst,
 *         dst+8 and dst+16, then dst itself is advanced by 24.
 * dstw  : operand that index is compared against.
 * index : loop counter; incremented by 8, loop repeats ("jb 1b") while
 *         index is below dstw (unsigned comparison).
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq "MANGLE(M24A)", %%mm0        \n\t"\
                        "movq "MANGLE(M24C)", %%mm7        \n\t"\
                        "pshufw $0x50, %%mm2, %%mm1        \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
                        "pshufw $0x50, %%mm4, %%mm3        \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
                        "pshufw $0x00, %%mm5, %%mm6        \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
                        "pand %%mm0, %%mm1                \n\t" /*    B2        B1       B0 */\
                        "pand %%mm0, %%mm3                \n\t" /*    G2        G1       G0 */\
                        "pand %%mm7, %%mm6                \n\t" /*       R1        R0       */\
\
                        "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
                        "por %%mm1, %%mm6                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, (dst))\
\
                        "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
                        "pshufw $0xA5, %%mm2, %%mm1        \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
                        "pshufw $0x55, %%mm4, %%mm3        \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
                        "pshufw $0xA5, %%mm5, %%mm6        \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
                        "pand "MANGLE(M24B)", %%mm1        \n\t" /* B5       B4        B3    */\
                        "pand %%mm7, %%mm3                \n\t" /*       G4        G3       */\
                        "pand %%mm0, %%mm6                \n\t" /*    R4        R3       R2 */\
\
                        "por %%mm1, %%mm3                \n\t" /* B5    G4 B4     G3 B3    */\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "pshufw $0xFF, %%mm2, %%mm1        \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
                        "pshufw $0xFA, %%mm4, %%mm3        \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
                        "pshufw $0xFA, %%mm5, %%mm6        \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
                        "pand %%mm7, %%mm1                \n\t" /*       B7        B6       */\
                        "pand %%mm0, %%mm3                \n\t" /*    G7        G6       G5 */\
                        "pand "MANGLE(M24B)", %%mm6        \n\t" /* R7       R6        R5    */\
\
                        "por %%mm1, %%mm3                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
903

    
904
/* Select the 24-bit writer: the pshufw-based variant when HAVE_MMX2 is
 * defined, the plain MMX variant otherwise. Any earlier definition is
 * dropped first so the macro can be redefined. */
#undef WRITEBGR24
#ifdef HAVE_MMX2
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
911

    
912
/*
 * Loop tail: pack word values to unsigned-saturated bytes and store 16 bytes
 * in which bytes from mm1/mm7 alternate with bytes taken alternately from
 * mm3 and mm4.
 *
 * Expects word data in mm1 and mm7 (packed together into mm1) and in mm3 and
 * mm4 (each packed, then byte-interleaved as mm3,mm4,mm3,mm4,...).
 * Clobbers mm1, mm3, mm4, mm7.
 * NOTE(review): by the macro name this is the Y/U/Y/V byte order with luma in
 * mm1/mm7, U in mm3 and V in mm4 -- confirm against the loop-head macros.
 *
 * dst   : base register of the destination; stores go to dst + 2*index.
 * dstw  : operand that index is compared against.
 * index : loop counter; incremented by 8, loop repeats ("jb 1b") while
 *         index is below dstw (unsigned comparison).
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
                        "packuswb %%mm3, %%mm3                \n\t"\
                        "packuswb %%mm4, %%mm4                \n\t"\
                        "packuswb %%mm7, %%mm1                \n\t"\
                        "punpcklbw %%mm4, %%mm3                \n\t"\
                        "movq %%mm1, %%mm7                \n\t"\
                        "punpcklbw %%mm3, %%mm1                \n\t"\
                        "punpckhbw %%mm3, %%mm7                \n\t"\
\
                        MOVNTQ(%%mm1, (dst, index, 2))\
                        MOVNTQ(%%mm7, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"

/* Forwarding layer so that the arguments are macro-expanded before being stringified. */
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
928

    
929

    
930
/**
 * Vertical scaling to planar output using multi-tap filters.
 *
 * With HAVE_MMX the work is done by the YSCALEYUV2YV12X macros (the
 * _ACCURATE variants when SWS_ACCURATE_RND is set in c->flags); the chroma
 * planes are only written when uDest is non-NULL, and the second chroma
 * plane is addressed with an offset of 4096.
 * Without HAVE_MMX the call is forwarded unchanged to the AltiVec
 * implementation (HAVE_ALTIVEC) or to the plain C implementation.
 *
 * @param c             scaler context (flags are read in the MMX path)
 * @param lumFilter     luma filter coefficients
 * @param lumSrc        luma source lines
 * @param lumFilterSize number of luma filter taps
 * @param chrFilter     chroma filter coefficients
 * @param chrSrc        chroma source lines
 * @param chrFilterSize number of chroma filter taps
 * @param dest          luma output
 * @param uDest         first chroma output; NULL skips chroma in the MMX path
 * @param vDest         second chroma output
 * @param dstW          luma output width
 * @param chrDstW       chroma output width
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
        if(c->flags & SWS_ACCURATE_RND){
                if(uDest){
                        YSCALEYUV2YV12X_ACCURATE(   0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                        YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
                }

                YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
                if(uDest){
                        YSCALEYUV2YV12X(   0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                        YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
                }

                YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
#else
#ifdef HAVE_ALTIVEC
        yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                              chrFilter, chrSrc, chrFilterSize,
                              dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
        yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                    chrFilter, chrSrc, chrFilterSize,
                    dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
962

    
963
/**
 * Vertical scaling to NV12-style output (one luma plane, one chroma
 * destination). There is no platform-specific code here: every argument
 * except the context c, which is unused, is forwarded to yuv2nv12XinC().
 *
 * @param c             scaler context (not used)
 * @param lumFilter     luma filter coefficients
 * @param lumSrc        luma source lines
 * @param lumFilterSize number of luma filter taps
 * @param chrFilter     chroma filter coefficients
 * @param chrSrc        chroma source lines
 * @param chrFilterSize number of chroma filter taps
 * @param dest          luma output
 * @param uDest         chroma output
 * @param dstW          luma output width
 * @param chrDstW       chroma output width
 * @param dstFormat     destination pixel format, passed through
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
        yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                     chrFilter, chrSrc, chrFilterSize,
                     dest, uDest, dstW, chrDstW, dstFormat);
}
971

    
972
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
973
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
974
{
975
#ifdef HAVE_MMX
976
        if(uDest != NULL)
977
        {
978
                asm volatile(
979
                                YSCALEYUV2YV121
980
                                :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
981
                                "g" (-chrDstW)
982
                                : "%"REG_a
983
                        );
984

    
985
                asm volatile(
986
                                YSCALEYUV2YV121
987
                                :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
988
                                "g" (-chrDstW)
989
                                : "%"REG_a
990
                        );
991
        }
992

    
993
        asm volatile(
994
                YSCALEYUV2YV121
995
                :: "r" (lumSrc + dstW), "r" (dest + dstW),
996
                "g" (-dstW)
997
                : "%"REG_a
998
        );
999
#else
1000
        int i;
1001
        for(i=0; i<dstW; i++)
1002
        {
1003
                int val= lumSrc[i]>>7;
1004
                
1005
                if(val&256){
1006
                        if(val<0) val=0;
1007
                        else      val=255;
1008
                }
1009

    
1010
                dest[i]= val;
1011
        }
1012

    
1013
        if(uDest != NULL)
1014
                for(i=0; i<chrDstW; i++)
1015
                {
1016
                        int u=chrSrc[i]>>7;
1017
                        int v=chrSrc[i + 2048]>>7;
1018

    
1019
                        if((u|v)&256){
1020
                                if(u<0)         u=0;
1021
                                else if (u>255) u=255;
1022
                                if(v<0)         v=0;
1023
                                else if (v>255) v=255;
1024
                        }
1025

    
1026
                        uDest[i]= u;
1027
                        vDest[i]= v;
1028
                }
1029
#endif
1030
}
1031

    
1032

    
1033
/**
1034
 * vertical scale YV12 to RGB
1035
 */
1036
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1037
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1038
                            uint8_t *dest, long dstW, long dstY)
1039
{
1040
#ifdef HAVE_MMX
1041
    long dummy=0;
1042
    if(c->flags & SWS_ACCURATE_RND){
1043
                switch(c->dstFormat){
1044
                case PIX_FMT_RGB32:
1045
                                YSCALEYUV2PACKEDX_ACCURATE
1046
                                YSCALEYUV2RGBX
1047
                                WRITEBGR32(%4, %5, %%REGa)
1048

    
1049
                                YSCALEYUV2PACKEDX_END
1050
                        return;
1051
                case PIX_FMT_BGR24:
1052
                                YSCALEYUV2PACKEDX_ACCURATE
1053
                                YSCALEYUV2RGBX
1054
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1055
                                "add %4, %%"REG_c"                        \n\t"
1056
                                WRITEBGR24(%%REGc, %5, %%REGa)
1057

    
1058

    
1059
                        :: "r" (&c->redDither), 
1060
                           "m" (dummy), "m" (dummy), "m" (dummy),
1061
                           "r" (dest), "m" (dstW)
1062
                        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1063
                        );
1064
                        return;
1065
                case PIX_FMT_BGR555:
1066
                                YSCALEYUV2PACKEDX_ACCURATE
1067
                                YSCALEYUV2RGBX
1068
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1069
#ifdef DITHER1XBPP
1070
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1071
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1072
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1073
#endif
1074

    
1075
                                WRITEBGR15(%4, %5, %%REGa)
1076
                                YSCALEYUV2PACKEDX_END
1077
                        return;
1078
                case PIX_FMT_BGR565:
1079
                                YSCALEYUV2PACKEDX_ACCURATE
1080
                                YSCALEYUV2RGBX
1081
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1082
#ifdef DITHER1XBPP
1083
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1084
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1085
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1086
#endif
1087

    
1088
                                WRITEBGR16(%4, %5, %%REGa)
1089
                                YSCALEYUV2PACKEDX_END
1090
                        return;
1091
                case PIX_FMT_YUYV422:
1092
                                YSCALEYUV2PACKEDX_ACCURATE
1093
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1094

    
1095
                                "psraw $3, %%mm3                \n\t"
1096
                                "psraw $3, %%mm4                \n\t"
1097
                                "psraw $3, %%mm1                \n\t"
1098
                                "psraw $3, %%mm7                \n\t"
1099
                                WRITEYUY2(%4, %5, %%REGa)
1100
                                YSCALEYUV2PACKEDX_END
1101
                        return;
1102
                }
1103
    }else{
1104
        switch(c->dstFormat)
1105
        {
1106
        case PIX_FMT_RGB32:
1107
                                YSCALEYUV2PACKEDX
1108
                                YSCALEYUV2RGBX
1109
                                WRITEBGR32(%4, %5, %%REGa)
1110
                                YSCALEYUV2PACKEDX_END
1111
                return;
1112
        case PIX_FMT_BGR24:
1113
                                YSCALEYUV2PACKEDX
1114
                                YSCALEYUV2RGBX
1115
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1116
                                "add %4, %%"REG_c"                        \n\t"
1117
                                WRITEBGR24(%%REGc, %5, %%REGa)
1118

    
1119
                        :: "r" (&c->redDither), 
1120
                           "m" (dummy), "m" (dummy), "m" (dummy),
1121
                           "r" (dest), "m" (dstW)
1122
                        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1123
                        );
1124
                return;
1125
        case PIX_FMT_BGR555:
1126
                                YSCALEYUV2PACKEDX
1127
                                YSCALEYUV2RGBX
1128
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1129
#ifdef DITHER1XBPP
1130
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1131
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1132
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1133
#endif
1134

    
1135
                                WRITEBGR15(%4, %5, %%REGa)
1136
                                YSCALEYUV2PACKEDX_END
1137
                return;
1138
        case PIX_FMT_BGR565:
1139
                                YSCALEYUV2PACKEDX
1140
                                YSCALEYUV2RGBX
1141
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1142
#ifdef DITHER1XBPP
1143
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1144
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1145
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1146
#endif
1147

    
1148
                                WRITEBGR16(%4, %5, %%REGa)
1149
                                YSCALEYUV2PACKEDX_END
1150
                return;
1151
        case PIX_FMT_YUYV422:
1152
                                YSCALEYUV2PACKEDX
1153
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1154

    
1155
                                "psraw $3, %%mm3                \n\t"
1156
                                "psraw $3, %%mm4                \n\t"
1157
                                "psraw $3, %%mm1                \n\t"
1158
                                "psraw $3, %%mm7                \n\t"
1159
                                WRITEYUY2(%4, %5, %%REGa)
1160
                                YSCALEYUV2PACKEDX_END
1161
                return;
1162
        }
1163
    }
1164
#endif
1165
#ifdef HAVE_ALTIVEC
1166
                /* The following list of supported dstFormat values should
1167
                   match what's found in the body of altivec_yuv2packedX() */
1168
                if(c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1169
                   c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1170
                   c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
1171
                        altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1172
                                    chrFilter, chrSrc, chrFilterSize,
1173
                                    dest, dstW, dstY);
1174
                else
1175
#endif
1176
                        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1177
                                    chrFilter, chrSrc, chrFilterSize,
1178
                                    dest, dstW, dstY);
1179
}
1180

    
1181
/**
1182
 * vertical bilinear scale YV12 to RGB
1183
 */
1184
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1185
                            uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1186
{
1187
        int yalpha1=yalpha^4095;
1188
        int uvalpha1=uvalpha^4095;
1189
        int i;
1190

    
1191
#if 0 //isn't used
1192
        if(flags&SWS_FULL_CHR_H_INT)
1193
        {
1194
                switch(dstFormat)
1195
                {
1196
#ifdef HAVE_MMX
1197
                case PIX_FMT_RGB32:
1198
                        asm volatile(
1199

1200

1201
FULL_YSCALEYUV2RGB
1202
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
1203
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
1204

1205
                        "movq %%mm3, %%mm1                \n\t"
1206
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
1207
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
1208

1209
                        MOVNTQ(%%mm3, (%4, %%REGa, 4))
1210
                        MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1211

1212
                        "add $4, %%"REG_a"                \n\t"
1213
                        "cmp %5, %%"REG_a"                \n\t"
1214
                        " jb 1b                                \n\t"
1215

1216

1217
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1218
                        "m" (yalpha1), "m" (uvalpha1)
1219
                        : "%"REG_a
1220
                        );
1221
                        break;
1222
                case PIX_FMT_BGR24:
1223
                        asm volatile(
1224

1225
FULL_YSCALEYUV2RGB
1226

1227
                                                                // lsb ... msb
1228
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
1229
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0
1230

1231
                        "movq %%mm3, %%mm1                \n\t"
1232
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
1233
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0
1234

1235
                        "movq %%mm3, %%mm2                \n\t" // BGR0BGR0
1236
                        "psrlq $8, %%mm3                \n\t" // GR0BGR00
1237
                        "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
1238
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
1239
                        "por %%mm2, %%mm3                \n\t" // BGRBGR00
1240
                        "movq %%mm1, %%mm2                \n\t"
1241
                        "psllq $48, %%mm1                \n\t" // 000000BG
1242
                        "por %%mm1, %%mm3                \n\t" // BGRBGRBG
1243

1244
                        "movq %%mm2, %%mm1                \n\t" // BGR0BGR0
1245
                        "psrld $16, %%mm2                \n\t" // R000R000
1246
                        "psrlq $24, %%mm1                \n\t" // 0BGR0000
1247
                        "por %%mm2, %%mm1                \n\t" // RBGRR000
1248

1249
                        "mov %4, %%"REG_b"                \n\t"
1250
                        "add %%"REG_a", %%"REG_b"        \n\t"
1251

1252
#ifdef HAVE_MMX2
1253
                        //FIXME Alignment
1254
                        "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
1255
                        "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
1256
#else
1257
                        "movd %%mm3, (%%"REG_b", %%"REG_a", 2)        \n\t"
1258
                        "psrlq $32, %%mm3                \n\t"
1259
                        "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)        \n\t"
1260
                        "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)        \n\t"
1261
#endif
1262
                        "add $4, %%"REG_a"                \n\t"
1263
                        "cmp %5, %%"REG_a"                \n\t"
1264
                        " jb 1b                                \n\t"
1265

    
1266
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1267
                        "m" (yalpha1), "m" (uvalpha1)
1268
                        : "%"REG_a, "%"REG_b
1269
                        );
1270
                        break;
1271
                case PIX_FMT_BGR555:
1272
                        asm volatile(
1273

    
1274
FULL_YSCALEYUV2RGB
1275
#ifdef DITHER1XBPP
1276
                        "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
1277
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1278
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1279
#endif
1280
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
1281
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
1282
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
1283

    
1284
                        "psrlw $3, %%mm3                \n\t"
1285
                        "psllw $2, %%mm1                \n\t"
1286
                        "psllw $7, %%mm0                \n\t"
1287
                        "pand "MANGLE(g15Mask)", %%mm1        \n\t"
1288
                        "pand "MANGLE(r15Mask)", %%mm0        \n\t"
1289

    
1290
                        "por %%mm3, %%mm1                \n\t"
1291
                        "por %%mm1, %%mm0                \n\t"
1292

    
1293
                        MOVNTQ(%%mm0, (%4, %%REGa, 2))
1294

    
1295
                        "add $4, %%"REG_a"                \n\t"
1296
                        "cmp %5, %%"REG_a"                \n\t"
1297
                        " jb 1b                                \n\t"
1298

    
1299
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1300
                        "m" (yalpha1), "m" (uvalpha1)
1301
                        : "%"REG_a
1302
                        );
1303
                        break;
1304
                case PIX_FMT_BGR565:
1305
                        asm volatile(
1306

    
1307
FULL_YSCALEYUV2RGB
1308
#ifdef DITHER1XBPP
1309
                        "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
1310
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
1311
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
1312
#endif
1313
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
1314
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
1315
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R
1316

    
1317
                        "psrlw $3, %%mm3                \n\t"
1318
                        "psllw $3, %%mm1                \n\t"
1319
                        "psllw $8, %%mm0                \n\t"
1320
                        "pand "MANGLE(g16Mask)", %%mm1        \n\t"
1321
                        "pand "MANGLE(r16Mask)", %%mm0        \n\t"
1322

    
1323
                        "por %%mm3, %%mm1                \n\t"
1324
                        "por %%mm1, %%mm0                \n\t"
1325

    
1326
                        MOVNTQ(%%mm0, (%4, %%REGa, 2))
1327

    
1328
                        "add $4, %%"REG_a"                \n\t"
1329
                        "cmp %5, %%"REG_a"                \n\t"
1330
                        " jb 1b                                \n\t"
1331

    
1332
                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1333
                        "m" (yalpha1), "m" (uvalpha1)
1334
                        : "%"REG_a
1335
                        );
1336
                break;
1337
#endif
1338
                case PIX_FMT_BGR32:
1339
#ifndef HAVE_MMX
1340
                case PIX_FMT_RGB32:
1341
#endif
1342
                if(dstFormat==PIX_FMT_RGB32)
1343
                {
1344
                        int i;
1345
#ifdef WORDS_BIGENDIAN
1346
                        dest++;
1347
#endif
1348
                        for(i=0;i<dstW;i++){
1349
                                // vertical linear interpolation && yuv2rgb in a single step:
1350
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1351
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1352
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1353
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1354
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1355
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1356
                                dest+= 4;
1357
                        }
1358
                }
1359
                else if(dstFormat==PIX_FMT_BGR24)
1360
                {
1361
                        int i;
1362
                        for(i=0;i<dstW;i++){
1363
                                // vertical linear interpolation && yuv2rgb in a single step:
1364
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1365
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1366
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1367
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1368
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1369
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1370
                                dest+= 3;
1371
                        }
1372
                }
1373
                else if(dstFormat==PIX_FMT_BGR565)
1374
                {
1375
                        int i;
1376
                        for(i=0;i<dstW;i++){
1377
                                // vertical linear interpolation && yuv2rgb in a single step:
1378
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1379
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1380
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1381

    
1382
                                ((uint16_t*)dest)[i] =
1383
                                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1384
                                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1385
                                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1386
                        }
1387
                }
1388
                else if(dstFormat==PIX_FMT_BGR555)
1389
                {
1390
                        int i;
1391
                        for(i=0;i<dstW;i++){
1392
                                // vertical linear interpolation && yuv2rgb in a single step:
1393
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1394
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1395
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1396

    
1397
                                ((uint16_t*)dest)[i] =
1398
                                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1399
                                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1400
                                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1401
                        }
1402
                }
1403
        }//FULL_UV_IPOL
1404
        else
1405
        {
1406
#endif // if 0
1407
#ifdef HAVE_MMX
1408
        switch(c->dstFormat)
1409
        {
1410
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1411
        case PIX_FMT_RGB32:
1412
                        asm volatile(
1413
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1414
                                "mov %4, %%"REG_b"                        \n\t"
1415
                                "push %%"REG_BP"                        \n\t"
1416
                                YSCALEYUV2RGB(%%REGBP, %5)
1417
                                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1418
                                "pop %%"REG_BP"                         \n\t"
1419
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1420

    
1421
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1422
                        "a" (&c->redDither)
1423
                        );
1424
                        return;
1425
        case PIX_FMT_BGR24:
1426
                        asm volatile(
1427
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1428
                                "mov %4, %%"REG_b"                        \n\t"
1429
                                "push %%"REG_BP"                        \n\t"
1430
                                YSCALEYUV2RGB(%%REGBP, %5)
1431
                                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1432
                                "pop %%"REG_BP"                         \n\t"
1433
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1434
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1435
                        "a" (&c->redDither)
1436
                        );
1437
                        return;
1438
        case PIX_FMT_BGR555:
1439
                        asm volatile(
1440
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1441
                                "mov %4, %%"REG_b"                        \n\t"
1442
                                "push %%"REG_BP"                        \n\t"
1443
                                YSCALEYUV2RGB(%%REGBP, %5)
1444
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1445
#ifdef DITHER1XBPP
1446
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1447
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1448
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1449
#endif
1450

    
1451
                                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1452
                                "pop %%"REG_BP"                         \n\t"
1453
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1454

    
1455
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1456
                        "a" (&c->redDither)
1457
                        );
1458
                        return;
1459
        case PIX_FMT_BGR565:
1460
                        asm volatile(
1461
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1462
                                "mov %4, %%"REG_b"                        \n\t"
1463
                                "push %%"REG_BP"                        \n\t"
1464
                                YSCALEYUV2RGB(%%REGBP, %5)
1465
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1466
#ifdef DITHER1XBPP
1467
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1468
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1469
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1470
#endif
1471

    
1472
                                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1473
                                "pop %%"REG_BP"                         \n\t"
1474
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1475
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1476
                        "a" (&c->redDither)
1477
                        );
1478
                        return;
1479
        case PIX_FMT_YUYV422:
1480
                        asm volatile(
1481
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1482
                                "mov %4, %%"REG_b"                        \n\t"
1483
                                "push %%"REG_BP"                        \n\t"
1484
                                YSCALEYUV2PACKED(%%REGBP, %5)
1485
                                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1486
                                "pop %%"REG_BP"                         \n\t"
1487
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1488
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1489
                        "a" (&c->redDither)
1490
                        );
1491
                        return;
1492
        default: break;
1493
        }
1494
#endif //HAVE_MMX
1495
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1496
}
1497

    
1498
/**
1499
 * YV12 to RGB without scaling or interpolating
1500
 */
1501
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1502
                            uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1503
{
1504
        const int yalpha1=0;
1505
        int i;
1506
        
1507
        uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
1508
        const int yalpha= 4096; //FIXME ...
1509

    
1510
        if(flags&SWS_FULL_CHR_H_INT)
1511
        {
1512
                RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1513
                return;
1514
        }
1515

    
1516
#ifdef HAVE_MMX
1517
        if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
1518
        {
1519
                switch(dstFormat)
1520
                {
1521
                case PIX_FMT_RGB32:
1522
                        asm volatile(
1523
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1524
                                "mov %4, %%"REG_b"                        \n\t"
1525
                                "push %%"REG_BP"                        \n\t"
1526
                                YSCALEYUV2RGB1(%%REGBP, %5)
1527
                                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1528
                                "pop %%"REG_BP"                         \n\t"
1529
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1530

    
1531
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1532
                        "a" (&c->redDither)
1533
                        );
1534
                        return;
1535
                case PIX_FMT_BGR24:
1536
                        asm volatile(
1537
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1538
                                "mov %4, %%"REG_b"                        \n\t"
1539
                                "push %%"REG_BP"                        \n\t"
1540
                                YSCALEYUV2RGB1(%%REGBP, %5)
1541
                                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1542
                                "pop %%"REG_BP"                         \n\t"
1543
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1544

    
1545
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1546
                        "a" (&c->redDither)
1547
                        );
1548
                        return;
1549
                case PIX_FMT_BGR555:
1550
                        asm volatile(
1551
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1552
                                "mov %4, %%"REG_b"                        \n\t"
1553
                                "push %%"REG_BP"                        \n\t"
1554
                                YSCALEYUV2RGB1(%%REGBP, %5)
1555
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1556
#ifdef DITHER1XBPP
1557
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1558
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1559
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1560
#endif
1561
                                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1562
                                "pop %%"REG_BP"                         \n\t"
1563
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1564

    
1565
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1566
                        "a" (&c->redDither)
1567
                        );
1568
                        return;
1569
                case PIX_FMT_BGR565:
1570
                        asm volatile(
1571
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1572
                                "mov %4, %%"REG_b"                        \n\t"
1573
                                "push %%"REG_BP"                        \n\t"
1574
                                YSCALEYUV2RGB1(%%REGBP, %5)
1575
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1576
#ifdef DITHER1XBPP
1577
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1578
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1579
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1580
#endif
1581

    
1582
                                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1583
                                "pop %%"REG_BP"                         \n\t"
1584
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1585

    
1586
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1587
                        "a" (&c->redDither)
1588
                        );
1589
                        return;
1590
                case PIX_FMT_YUYV422:
1591
                        asm volatile(
1592
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1593
                                "mov %4, %%"REG_b"                        \n\t"
1594
                                "push %%"REG_BP"                        \n\t"
1595
                                YSCALEYUV2PACKED1(%%REGBP, %5)
1596
                                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1597
                                "pop %%"REG_BP"                         \n\t"
1598
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1599

    
1600
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1601
                        "a" (&c->redDither)
1602
                        );
1603
                        return;
1604
                }
1605
        }
1606
        else
1607
        {
1608
                switch(dstFormat)
1609
                {
1610
                case PIX_FMT_RGB32:
1611
                        asm volatile(
1612
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1613
                                "mov %4, %%"REG_b"                        \n\t"
1614
                                "push %%"REG_BP"                        \n\t"
1615
                                YSCALEYUV2RGB1b(%%REGBP, %5)
1616
                                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1617
                                "pop %%"REG_BP"                         \n\t"
1618
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1619

    
1620
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1621
                        "a" (&c->redDither)
1622
                        );
1623
                        return;
1624
                case PIX_FMT_BGR24:
1625
                        asm volatile(
1626
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1627
                                "mov %4, %%"REG_b"                        \n\t"
1628
                                "push %%"REG_BP"                        \n\t"
1629
                                YSCALEYUV2RGB1b(%%REGBP, %5)
1630
                                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1631
                                "pop %%"REG_BP"                         \n\t"
1632
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1633

    
1634
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1635
                        "a" (&c->redDither)
1636
                        );
1637
                        return;
1638
                case PIX_FMT_BGR555:
1639
                        asm volatile(
1640
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1641
                                "mov %4, %%"REG_b"                        \n\t"
1642
                                "push %%"REG_BP"                        \n\t"
1643
                                YSCALEYUV2RGB1b(%%REGBP, %5)
1644
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1645
#ifdef DITHER1XBPP
1646
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1647
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1648
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1649
#endif
1650
                                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1651
                                "pop %%"REG_BP"                         \n\t"
1652
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1653

    
1654
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1655
                        "a" (&c->redDither)
1656
                        );
1657
                        return;
1658
                case PIX_FMT_BGR565:
1659
                        asm volatile(
1660
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1661
                                "mov %4, %%"REG_b"                        \n\t"
1662
                                "push %%"REG_BP"                        \n\t"
1663
                                YSCALEYUV2RGB1b(%%REGBP, %5)
1664
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1665
#ifdef DITHER1XBPP
1666
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1667
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1668
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1669
#endif
1670

    
1671
                                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1672
                                "pop %%"REG_BP"                         \n\t"
1673
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1674

    
1675
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1676
                        "a" (&c->redDither)
1677
                        );
1678
                        return;
1679
                case PIX_FMT_YUYV422:
1680
                        asm volatile(
1681
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1682
                                "mov %4, %%"REG_b"                        \n\t"
1683
                                "push %%"REG_BP"                        \n\t"
1684
                                YSCALEYUV2PACKED1b(%%REGBP, %5)
1685
                                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1686
                                "pop %%"REG_BP"                         \n\t"
1687
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1688

    
1689
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1690
                        "a" (&c->redDither)
1691
                        );
1692
                        return;
1693
                }
1694
        }
1695
#endif
1696
        if( uvalpha < 2048 )
1697
        {
1698
                YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1699
        }else{
1700
                YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1701
        }
1702
}
1703

    
1704
//FIXME yuy2* can read up to 7 samples too many
1705

    
1706
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1707
{
1708
#ifdef HAVE_MMX
1709
        asm volatile(
1710
                "movq "MANGLE(bm01010101)", %%mm2\n\t"
1711
                "mov %0, %%"REG_a"                \n\t"
1712
                "1:                                \n\t"
1713
                "movq (%1, %%"REG_a",2), %%mm0        \n\t"
1714
                "movq 8(%1, %%"REG_a",2), %%mm1        \n\t"
1715
                "pand %%mm2, %%mm0                \n\t"
1716
                "pand %%mm2, %%mm1                \n\t"
1717
                "packuswb %%mm1, %%mm0                \n\t"
1718
                "movq %%mm0, (%2, %%"REG_a")        \n\t"
1719
                "add $8, %%"REG_a"                \n\t"
1720
                " js 1b                                \n\t"
1721
                : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1722
                : "%"REG_a
1723
        );
1724
#else
1725
        int i;
1726
        for(i=0; i<width; i++)
1727
                dst[i]= src[2*i];
1728
#endif
1729
}
1730

    
1731
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1732
{
1733
#ifdef HAVE_MMX
1734
        asm volatile(
1735
                "movq "MANGLE(bm01010101)", %%mm4\n\t"
1736
                "mov %0, %%"REG_a"                \n\t"
1737
                "1:                                \n\t"
1738
                "movq (%1, %%"REG_a",4), %%mm0        \n\t"
1739
                "movq 8(%1, %%"REG_a",4), %%mm1        \n\t"
1740
                "psrlw $8, %%mm0                \n\t"
1741
                "psrlw $8, %%mm1                \n\t"
1742
                "packuswb %%mm1, %%mm0                \n\t"
1743
                "movq %%mm0, %%mm1                \n\t"
1744
                "psrlw $8, %%mm0                \n\t"
1745
                "pand %%mm4, %%mm1                \n\t"
1746
                "packuswb %%mm0, %%mm0                \n\t"
1747
                "packuswb %%mm1, %%mm1                \n\t"
1748
                "movd %%mm0, (%3, %%"REG_a")        \n\t"
1749
                "movd %%mm1, (%2, %%"REG_a")        \n\t"
1750
                "add $4, %%"REG_a"                \n\t"
1751
                " js 1b                                \n\t"
1752
                : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1753
                : "%"REG_a
1754
        );
1755
#else
1756
        int i;
1757
        for(i=0; i<width; i++)
1758
        {
1759
                dstU[i]= src1[4*i + 1];
1760
                dstV[i]= src1[4*i + 3];
1761
        }
1762
#endif
1763
        assert(src1 == src2);
1764
}
1765

    
1766
//this is almost identical to the previous, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1767
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1768
{
1769
#ifdef HAVE_MMX
1770
        asm volatile(
1771
                "mov %0, %%"REG_a"                \n\t"
1772
                "1:                                \n\t"
1773
                "movq (%1, %%"REG_a",2), %%mm0        \n\t"
1774
                "movq 8(%1, %%"REG_a",2), %%mm1        \n\t"
1775
                "psrlw $8, %%mm0                \n\t"
1776
                "psrlw $8, %%mm1                \n\t"
1777
                "packuswb %%mm1, %%mm0                \n\t"
1778
                "movq %%mm0, (%2, %%"REG_a")        \n\t"
1779
                "add $8, %%"REG_a"                \n\t"
1780
                " js 1b                                \n\t"
1781
                : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1782
                : "%"REG_a
1783
        );
1784
#else
1785
        int i;
1786
        for(i=0; i<width; i++)
1787
                dst[i]= src[2*i+1];
1788
#endif
1789
}
1790

    
1791
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1792
{
1793
#ifdef HAVE_MMX
1794
        asm volatile(
1795
                "movq "MANGLE(bm01010101)", %%mm4\n\t"
1796
                "mov %0, %%"REG_a"                \n\t"
1797
                "1:                                \n\t"
1798
                "movq (%1, %%"REG_a",4), %%mm0        \n\t"
1799
                "movq 8(%1, %%"REG_a",4), %%mm1        \n\t"
1800
                "pand %%mm4, %%mm0                \n\t"
1801
                "pand %%mm4, %%mm1                \n\t"
1802
                "packuswb %%mm1, %%mm0                \n\t"
1803
                "movq %%mm0, %%mm1                \n\t"
1804
                "psrlw $8, %%mm0                \n\t"
1805
                "pand %%mm4, %%mm1                \n\t"
1806
                "packuswb %%mm0, %%mm0                \n\t"
1807
                "packuswb %%mm1, %%mm1                \n\t"
1808
                "movd %%mm0, (%3, %%"REG_a")        \n\t"
1809
                "movd %%mm1, (%2, %%"REG_a")        \n\t"
1810
                "add $4, %%"REG_a"                \n\t"
1811
                " js 1b                                \n\t"
1812
                : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1813
                : "%"REG_a
1814
        );
1815
#else
1816
        int i;
1817
        for(i=0; i<width; i++)
1818
        {
1819
                dstU[i]= src1[4*i + 0];
1820
                dstV[i]= src1[4*i + 2];
1821
        }
1822
#endif
1823
        assert(src1 == src2);
1824
}
1825

    
1826
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1827
{
1828
        int i;
1829
        for(i=0; i<width; i++)
1830
        {
1831
                int b=  ((uint32_t*)src)[i]&0xFF;
1832
                int g= (((uint32_t*)src)[i]>>8)&0xFF;
1833
                int r= (((uint32_t*)src)[i]>>16)&0xFF;
1834

    
1835
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1836
        }
1837
}
1838

    
1839
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1840
{
1841
        int i;
1842
        assert(src1 == src2);
1843
        for(i=0; i<width; i++)
1844
        {
1845
                const int a= ((uint32_t*)src1)[2*i+0];
1846
                const int e= ((uint32_t*)src1)[2*i+1];
1847
                const int l= (a&0xFF00FF) + (e&0xFF00FF);
1848
                const int h= (a&0x00FF00) + (e&0x00FF00);
1849
                 const int b=  l&0x3FF;
1850
                const int g=  h>>8;
1851
                const int r=  l>>16;
1852

    
1853
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1854
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1855
        }
1856
}
1857

    
1858
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1859
{
1860
#ifdef HAVE_MMX
1861
        asm volatile(
1862
                "mov %2, %%"REG_a"                \n\t"
1863
                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
1864
                "movq "MANGLE(w1111)", %%mm5                \n\t"
1865
                "pxor %%mm7, %%mm7                \n\t"
1866
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
1867
                ASMALIGN(4)
1868
                "1:                                \n\t"
1869
                PREFETCH" 64(%0, %%"REG_d")        \n\t"
1870
                "movd (%0, %%"REG_d"), %%mm0        \n\t"
1871
                "movd 3(%0, %%"REG_d"), %%mm1        \n\t"
1872
                "punpcklbw %%mm7, %%mm0                \n\t"
1873
                "punpcklbw %%mm7, %%mm1                \n\t"
1874
                "movd 6(%0, %%"REG_d"), %%mm2        \n\t"
1875
                "movd 9(%0, %%"REG_d"), %%mm3        \n\t"
1876
                "punpcklbw %%mm7, %%mm2                \n\t"
1877
                "punpcklbw %%mm7, %%mm3                \n\t"
1878
                "pmaddwd %%mm6, %%mm0                \n\t"
1879
                "pmaddwd %%mm6, %%mm1                \n\t"
1880
                "pmaddwd %%mm6, %%mm2                \n\t"
1881
                "pmaddwd %%mm6, %%mm3                \n\t"
1882
#ifndef FAST_BGR2YV12
1883
                "psrad $8, %%mm0                \n\t"
1884
                "psrad $8, %%mm1                \n\t"
1885
                "psrad $8, %%mm2                \n\t"
1886
                "psrad $8, %%mm3                \n\t"
1887
#endif
1888
                "packssdw %%mm1, %%mm0                \n\t"
1889
                "packssdw %%mm3, %%mm2                \n\t"
1890
                "pmaddwd %%mm5, %%mm0                \n\t"
1891
                "pmaddwd %%mm5, %%mm2                \n\t"
1892
                "packssdw %%mm2, %%mm0                \n\t"
1893
                "psraw $7, %%mm0                \n\t"
1894

    
1895
                "movd 12(%0, %%"REG_d"), %%mm4        \n\t"
1896
                "movd 15(%0, %%"REG_d"), %%mm1        \n\t"
1897
                "punpcklbw %%mm7, %%mm4                \n\t"
1898
                "punpcklbw %%mm7, %%mm1                \n\t"
1899
                "movd 18(%0, %%"REG_d"), %%mm2        \n\t"
1900
                "movd 21(%0, %%"REG_d"), %%mm3        \n\t"
1901
                "punpcklbw %%mm7, %%mm2                \n\t"
1902
                "punpcklbw %%mm7, %%mm3                \n\t"
1903
                "pmaddwd %%mm6, %%mm4                \n\t"
1904
                "pmaddwd %%mm6, %%mm1                \n\t"
1905
                "pmaddwd %%mm6, %%mm2                \n\t"
1906
                "pmaddwd %%mm6, %%mm3                \n\t"
1907
#ifndef FAST_BGR2YV12
1908
                "psrad $8, %%mm4                \n\t"
1909
                "psrad $8, %%mm1                \n\t"
1910
                "psrad $8, %%mm2                \n\t"
1911
                "psrad $8, %%mm3                \n\t"
1912
#endif
1913
                "packssdw %%mm1, %%mm4                \n\t"
1914
                "packssdw %%mm3, %%mm2                \n\t"
1915
                "pmaddwd %%mm5, %%mm4                \n\t"
1916
                "pmaddwd %%mm5, %%mm2                \n\t"
1917
                "add $24, %%"REG_d"                \n\t"
1918
                "packssdw %%mm2, %%mm4                \n\t"
1919
                "psraw $7, %%mm4                \n\t"
1920

    
1921
                "packuswb %%mm4, %%mm0                \n\t"
1922
                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"
1923

    
1924
                "movq %%mm0, (%1, %%"REG_a")        \n\t"
1925
                "add $8, %%"REG_a"                \n\t"
1926
                " js 1b                                \n\t"
1927
                : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1928
                : "%"REG_a, "%"REG_d
1929
        );
1930
#else
1931
        int i;
1932
        for(i=0; i<width; i++)
1933
        {
1934
                int b= src[i*3+0];
1935
                int g= src[i*3+1];
1936
                int r= src[i*3+2];
1937

    
1938
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1939
        }
1940
#endif
1941
}
1942

    
1943
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1944
{
1945
#ifdef HAVE_MMX
1946
        asm volatile(
1947
                "mov %3, %%"REG_a"                \n\t"
1948
                "movq "MANGLE(w1111)", %%mm5                \n\t"
1949
                "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
1950
                "pxor %%mm7, %%mm7                \n\t"
1951
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"        \n\t"
1952
                "add %%"REG_d", %%"REG_d"        \n\t"
1953
                ASMALIGN(4)
1954
                "1:                                \n\t"
1955
                PREFETCH" 64(%0, %%"REG_d")        \n\t"
1956
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1957
                "movq (%0, %%"REG_d"), %%mm0        \n\t"
1958
                "movq 6(%0, %%"REG_d"), %%mm2        \n\t"
1959
                "movq %%mm0, %%mm1                \n\t"
1960
                "movq %%mm2, %%mm3                \n\t"
1961
                "psrlq $24, %%mm0                \n\t"
1962
                "psrlq $24, %%mm2                \n\t"
1963
                PAVGB(%%mm1, %%mm0)
1964
                PAVGB(%%mm3, %%mm2)
1965
                "punpcklbw %%mm7, %%mm0                \n\t"
1966
                "punpcklbw %%mm7, %%mm2                \n\t"
1967
#else
1968
                "movd (%0, %%"REG_d"), %%mm0        \n\t"
1969
                "movd 3(%0, %%"REG_d"), %%mm2        \n\t"
1970
                "punpcklbw %%mm7, %%mm0                \n\t"
1971
                "punpcklbw %%mm7, %%mm2                \n\t"
1972
                "paddw %%mm2, %%mm0                \n\t"
1973
                "movd 6(%0, %%"REG_d"), %%mm4        \n\t"
1974
                "movd 9(%0, %%"REG_d"), %%mm2        \n\t"
1975
                "punpcklbw %%mm7, %%mm4                \n\t"
1976
                "punpcklbw %%mm7, %%mm2                \n\t"
1977
                "paddw %%mm4, %%mm2                \n\t"
1978
                "psrlw $1, %%mm0                \n\t"
1979
                "psrlw $1, %%mm2                \n\t"
1980
#endif
1981
                "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
1982
                "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
1983
                
1984
                "pmaddwd %%mm0, %%mm1                \n\t"
1985
                "pmaddwd %%mm2, %%mm3                \n\t"
1986
                "pmaddwd %%mm6, %%mm0                \n\t"
1987
                "pmaddwd %%mm6, %%mm2                \n\t"
1988
#ifndef FAST_BGR2YV12
1989
                "psrad $8, %%mm0                \n\t"
1990
                "psrad $8, %%mm1                \n\t"
1991
                "psrad $8, %%mm2                \n\t"
1992
                "psrad $8, %%mm3                \n\t"
1993
#endif
1994
                "packssdw %%mm2, %%mm0                \n\t"
1995
                "packssdw %%mm3, %%mm1                \n\t"
1996
                "pmaddwd %%mm5, %%mm0                \n\t"
1997
                "pmaddwd %%mm5, %%mm1                \n\t"
1998
                "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
1999
                "psraw $7, %%mm0                \n\t"
2000

    
2001
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2002
                "movq 12(%0, %%"REG_d"), %%mm4        \n\t"
2003
                "movq 18(%0, %%"REG_d"), %%mm2        \n\t"
2004
                "movq %%mm4, %%mm1                \n\t"
2005
                "movq %%mm2, %%mm3                \n\t"
2006
                "psrlq $24, %%mm4                \n\t"
2007
                "psrlq $24, %%mm2                \n\t"
2008
                PAVGB(%%mm1, %%mm4)
2009
                PAVGB(%%mm3, %%mm2)
2010
                "punpcklbw %%mm7, %%mm4                \n\t"
2011
                "punpcklbw %%mm7, %%mm2                \n\t"
2012
#else
2013
                "movd 12(%0, %%"REG_d"), %%mm4        \n\t"
2014
                "movd 15(%0, %%"REG_d"), %%mm2        \n\t"
2015
                "punpcklbw %%mm7, %%mm4                \n\t"
2016
                "punpcklbw %%mm7, %%mm2                \n\t"
2017
                "paddw %%mm2, %%mm4                \n\t"
2018
                "movd 18(%0, %%"REG_d"), %%mm5        \n\t"
2019
                "movd 21(%0, %%"REG_d"), %%mm2        \n\t"
2020
                "punpcklbw %%mm7, %%mm5                \n\t"
2021
                "punpcklbw %%mm7, %%mm2                \n\t"
2022
                "paddw %%mm5, %%mm2                \n\t"
2023
                "movq "MANGLE(w1111)", %%mm5                \n\t"
2024
                "psrlw $2, %%mm4                \n\t"
2025
                "psrlw $2, %%mm2                \n\t"
2026
#endif
2027
                "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
2028
                "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"
2029
                
2030
                "pmaddwd %%mm4, %%mm1                \n\t"
2031
                "pmaddwd %%mm2, %%mm3                \n\t"
2032
                "pmaddwd %%mm6, %%mm4                \n\t"
2033
                "pmaddwd %%mm6, %%mm2                \n\t"
2034
#ifndef FAST_BGR2YV12
2035
                "psrad $8, %%mm4                \n\t"
2036
                "psrad $8, %%mm1                \n\t"
2037
                "psrad $8, %%mm2                \n\t"
2038
                "psrad $8, %%mm3                \n\t"
2039
#endif
2040
                "packssdw %%mm2, %%mm4                \n\t"
2041
                "packssdw %%mm3, %%mm1                \n\t"
2042
                "pmaddwd %%mm5, %%mm4                \n\t"
2043
                "pmaddwd %%mm5, %%mm1                \n\t"
2044
                "add $24, %%"REG_d"                \n\t"
2045
                "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
2046
                "psraw $7, %%mm4                \n\t"
2047
                
2048
                "movq %%mm0, %%mm1                \n\t"
2049
                "punpckldq %%mm4, %%mm0                \n\t"
2050
                "punpckhdq %%mm4, %%mm1                \n\t"
2051
                "packsswb %%mm1, %%mm0                \n\t"
2052
                "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"
2053

    
2054
                "movd %%mm0, (%1, %%"REG_a")        \n\t"
2055
                "punpckhdq %%mm0, %%mm0                \n\t"
2056
                "movd %%mm0, (%2, %%"REG_a")        \n\t"
2057
                "add $4, %%"REG_a"                \n\t"
2058
                " js 1b                                \n\t"
2059
                : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
2060
                : "%"REG_a, "%"REG_d
2061
        );
2062
#else
2063
        int i;
2064
        for(i=0; i<width; i++)
2065
        {
2066
                int b= src1[6*i + 0] + src1[6*i + 3];
2067
                int g= src1[6*i + 1] + src1[6*i + 4];
2068
                int r= src1[6*i + 2] + src1[6*i + 5];
2069

    
2070
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2071
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2072
        }
2073
#endif
2074
        assert(src1 == src2);
2075
}
2076

    
2077
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2078
{
2079
        int i;
2080
        for(i=0; i<width; i++)
2081
        {
2082
                int d= ((uint16_t*)src)[i];
2083
                int b= d&0x1F;
2084
                int g= (d>>5)&0x3F;
2085
                int r= (d>>11)&0x1F;
2086

    
2087
                dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2088
        }
2089
}
2090

    
2091
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2092
{
2093
        int i;
2094
        assert(src1==src2);
2095
        for(i=0; i<width; i++)
2096
        {
2097
                int d0= ((uint32_t*)src1)[i];
2098
                
2099
                int dl= (d0&0x07E0F81F);
2100
                int dh= ((d0>>5)&0x07C0F83F);
2101

    
2102
                int dh2= (dh>>11) + (dh<<21);
2103
                int d= dh2 + dl;
2104

    
2105
                int b= d&0x7F;
2106
                int r= (d>>11)&0x7F;
2107
                int g= d>>21;
2108
                dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2109
                dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2110
        }
2111
}
2112

    
2113
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2114
{
2115
        int i;
2116
        for(i=0; i<width; i++)
2117
        {
2118
                int d= ((uint16_t*)src)[i];
2119
                int b= d&0x1F;
2120
                int g= (d>>5)&0x1F;
2121
                int r= (d>>10)&0x1F;
2122

    
2123
                dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2124
        }
2125
}
2126

    
2127
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2128
{
2129
        int i;
2130
        assert(src1==src2);
2131
        for(i=0; i<width; i++)
2132
        {
2133
                int d0= ((uint32_t*)src1)[i];
2134
                
2135
                int dl= (d0&0x03E07C1F);
2136
                int dh= ((d0>>5)&0x03E0F81F);
2137

    
2138
                int dh2= (dh>>11) + (dh<<21);
2139
                int d= dh2 + dl;
2140

    
2141
                int b= d&0x7F;
2142
                int r= (d>>10)&0x7F;
2143
                int g= d>>21;
2144
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2145
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2146
        }
2147
}
2148

    
2149

    
2150
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2151
{
2152
        int i;
2153
        for(i=0; i<width; i++)
2154
        {
2155
                int r=  ((uint32_t*)src)[i]&0xFF;
2156
                int g= (((uint32_t*)src)[i]>>8)&0xFF;
2157
                int b= (((uint32_t*)src)[i]>>16)&0xFF;
2158

    
2159
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2160
        }
2161
}
2162

    
2163
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2164
{
2165
        int i;
2166
        assert(src1==src2);
2167
        for(i=0; i<width; i++)
2168
        {
2169
                const int a= ((uint32_t*)src1)[2*i+0];
2170
                const int e= ((uint32_t*)src1)[2*i+1];
2171
                const int l= (a&0xFF00FF) + (e&0xFF00FF);
2172
                const int h= (a&0x00FF00) + (e&0x00FF00);
2173
                 const int r=  l&0x3FF;
2174
                const int g=  h>>8;
2175
                const int b=  l>>16;
2176

    
2177
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2178
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2179
        }
2180
}
2181

    
2182
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2183
{
2184
        int i;
2185
        for(i=0; i<width; i++)
2186
        {
2187
                int r= src[i*3+0];
2188
                int g= src[i*3+1];
2189
                int b= src[i*3+2];
2190

    
2191
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2192
        }
2193
}
2194

    
2195
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2196
{
2197
        int i;
2198
        assert(src1==src2);
2199
        for(i=0; i<width; i++)
2200
        {
2201
                int r= src1[6*i + 0] + src1[6*i + 3];
2202
                int g= src1[6*i + 1] + src1[6*i + 4];
2203
                int b= src1[6*i + 2] + src1[6*i + 5];
2204

    
2205
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2206
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2207
        }
2208
}
2209

    
2210
static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2211
{
2212
        int i;
2213
        for(i=0; i<width; i++)
2214
        {
2215
                int d= ((uint16_t*)src)[i];
2216
                int r= d&0x1F;
2217
                int g= (d>>5)&0x3F;
2218
                int b= (d>>11)&0x1F;
2219

    
2220
                dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2221
        }
2222
}
2223

    
2224
static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2225
{
2226
        int i;
2227
        assert(src1 == src2);
2228
        for(i=0; i<width; i++)
2229
        {
2230
                int d0= ((uint32_t*)src1)[i];
2231
                
2232
                int dl= (d0&0x07E0F81F);
2233
                int dh= ((d0>>5)&0x07C0F83F);
2234

    
2235
                int dh2= (dh>>11) + (dh<<21);
2236
                int d= dh2 + dl;
2237

    
2238
                int r= d&0x7F;
2239
                int b= (d>>11)&0x7F;
2240
                int g= d>>21;
2241
                dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2242
                dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2243
        }
2244
}
2245

    
2246
static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2247
{
2248
        int i;
2249
        for(i=0; i<width; i++)
2250
        {
2251
                int d= ((uint16_t*)src)[i];
2252
                int r= d&0x1F;
2253
                int g= (d>>5)&0x1F;
2254
                int b= (d>>10)&0x1F;
2255

    
2256
                dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2257
        }
2258
}
2259

    
2260
static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2261
{
2262
        int i;
2263
        assert(src1 == src2);
2264
        for(i=0; i<width; i++)
2265
        {
2266
                int d0= ((uint32_t*)src1)[i];
2267
                
2268
                int dl= (d0&0x03E07C1F);
2269
                int dh= ((d0>>5)&0x03E0F81F);
2270

    
2271
                int dh2= (dh>>11) + (dh<<21);
2272
                int d= dh2 + dl;
2273

    
2274
                int g= d&0x7F;
2275
                int r= (d>>10)&0x7F;
2276
                int b= d>>21;
2277
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2278
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2279
        }
2280
}
2281

    
2282
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2283
{
2284
        int i;
2285
        for(i=0; i<width; i++)
2286
        {
2287
                int d= src[i];
2288
                int b= pal[d]     &0xFF;
2289
                int g=(pal[d]>>8 )&0xFF;
2290
                int r= pal[d]>>16;
2291

    
2292
                dst[i]= ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2293
        }
2294
}
2295

    
2296
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2297
{
2298
        int i;
2299
        assert(src1 == src2);
2300
        for(i=0; i<width; i++)
2301
        {
2302
                int d0= src1[2*i  ];
2303
                int d1= src1[2*i+1];
2304
                int p = (pal[d0]&0xFF00FF) + (pal[d1]&0xFF00FF);
2305
                int g = (pal[d0]+pal[d1]-p)>>8;
2306
                int b= p&0x1FF;
2307
                int r= p>>16;
2308

    
2309
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2310
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2311
        }
2312
}
2313

    
2314
// Bilinear / Bicubic scaling
2315
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2316
                                  int16_t *filter, int16_t *filterPos, long filterSize)
2317
{
2318
#ifdef HAVE_MMX
2319
        assert(filterSize % 4 == 0 && filterSize>0);
2320
        if(filterSize==4) // allways true for upscaling, sometimes for down too
2321
        {
2322
                long counter= -2*dstW;
2323
                filter-= counter*2;
2324
                filterPos-= counter/2;
2325
                dst-= counter/2;
2326
                asm volatile(
2327
#if defined(PIC)
2328
                        "push %%"REG_b"                   \n\t"
2329
#endif
2330
                        "pxor %%mm7, %%mm7                \n\t"
2331
                        "movq "MANGLE(w02)", %%mm6        \n\t"
2332
                        "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2333
                        "mov %%"REG_a", %%"REG_BP"        \n\t"
2334
                        ASMALIGN(4)
2335
                        "1:                                \n\t"
2336
                        "movzwl (%2, %%"REG_BP"), %%eax        \n\t"
2337
                        "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2338
                        "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2339
                        "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2340
                        "movd (%3, %%"REG_a"), %%mm0        \n\t"
2341
                        "movd (%3, %%"REG_b"), %%mm2        \n\t"
2342
                        "punpcklbw %%mm7, %%mm0                \n\t"
2343
                        "punpcklbw %%mm7, %%mm2                \n\t"
2344
                        "pmaddwd %%mm1, %%mm0                \n\t"
2345
                        "pmaddwd %%mm2, %%mm3                \n\t"
2346
                        "psrad $8, %%mm0                \n\t"
2347
                        "psrad $8, %%mm3                \n\t"
2348
                        "packssdw %%mm3, %%mm0                \n\t"
2349
                        "pmaddwd %%mm6, %%mm0                \n\t"
2350
                        "packssdw %%mm0, %%mm0                \n\t"
2351
                        "movd %%mm0, (%4, %%"REG_BP")        \n\t"
2352
                        "add $4, %%"REG_BP"                \n\t"
2353
                        " jnc 1b                        \n\t"
2354

    
2355
                        "pop %%"REG_BP"                        \n\t"
2356
#if defined(PIC)
2357
                        "pop %%"REG_b"                   \n\t"
2358
#endif
2359
                        : "+a" (counter)
2360
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2361
#if !defined(PIC)
2362
                        : "%"REG_b
2363
#endif
2364
                );
2365
        }
2366
        else if(filterSize==8)
2367
        {
2368
                long counter= -2*dstW;
2369
                filter-= counter*4;
2370
                filterPos-= counter/2;
2371
                dst-= counter/2;
2372
                asm volatile(
2373
#if defined(PIC)
2374
                        "push %%"REG_b"                   \n\t"
2375
#endif
2376
                        "pxor %%mm7, %%mm7                \n\t"
2377
                        "movq "MANGLE(w02)", %%mm6        \n\t"
2378
                        "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2379
                        "mov %%"REG_a", %%"REG_BP"        \n\t"
2380
                        ASMALIGN(4)
2381
                        "1:                                \n\t"
2382
                        "movzwl (%2, %%"REG_BP"), %%eax        \n\t"
2383
                        "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2384
                        "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2385
                        "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2386
                        "movd (%3, %%"REG_a"), %%mm0        \n\t"
2387
                        "movd (%3, %%"REG_b"), %%mm2        \n\t"
2388
                        "punpcklbw %%mm7, %%mm0                \n\t"
2389
                        "punpcklbw %%mm7, %%mm2                \n\t"
2390
                        "pmaddwd %%mm1, %%mm0                \n\t"
2391
                        "pmaddwd %%mm2, %%mm3                \n\t"
2392

    
2393
                        "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2394
                        "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2395
                        "movd 4(%3, %%"REG_a"), %%mm4        \n\t"
2396
                        "movd 4(%3, %%"REG_b"), %%mm2        \n\t"
2397
                        "punpcklbw %%mm7, %%mm4                \n\t"
2398
                        "punpcklbw %%mm7, %%mm2                \n\t"
2399
                        "pmaddwd %%mm1, %%mm4                \n\t"
2400
                        "pmaddwd %%mm2, %%mm5                \n\t"
2401
                        "paddd %%mm4, %%mm0                \n\t"
2402
                        "paddd %%mm5, %%mm3                \n\t"
2403
                                                
2404
                        "psrad $8, %%mm0                \n\t"
2405
                        "psrad $8, %%mm3                \n\t"
2406
                        "packssdw %%mm3, %%mm0                \n\t"
2407
                        "pmaddwd %%mm6, %%mm0                \n\t"
2408
                        "packssdw %%mm0, %%mm0                \n\t"
2409
                        "movd %%mm0, (%4, %%"REG_BP")        \n\t"
2410
                        "add $4, %%"REG_BP"                \n\t"
2411
                        " jnc 1b                        \n\t"
2412

    
2413
                        "pop %%"REG_BP"                        \n\t"
2414
#if defined(PIC)
2415
                        "pop %%"REG_b"                   \n\t"
2416
#endif
2417
                        : "+a" (counter)
2418
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2419
#if !defined(PIC)
2420
                        : "%"REG_b
2421
#endif
2422
                );
2423
        }
2424
        else
2425
        {
2426
                uint8_t *offset = src+filterSize;
2427
                long counter= -2*dstW;
2428
//                filter-= counter*filterSize/2;
2429
                filterPos-= counter/2;
2430
                dst-= counter/2;
2431
                asm volatile(
2432
                        "pxor %%mm7, %%mm7                \n\t"
2433
                        "movq "MANGLE(w02)", %%mm6        \n\t"
2434
                        ASMALIGN(4)
2435
                        "1:                                \n\t"
2436
                        "mov %2, %%"REG_c"                \n\t"
2437
                        "movzwl (%%"REG_c", %0), %%eax        \n\t"
2438
                        "movzwl 2(%%"REG_c", %0), %%edx        \n\t"
2439
                        "mov %5, %%"REG_c"                \n\t"
2440
                        "pxor %%mm4, %%mm4                \n\t"
2441
                        "pxor %%mm5, %%mm5                \n\t"
2442
                        "2:                                \n\t"
2443
                        "movq (%1), %%mm1                \n\t"
2444
                        "movq (%1, %6), %%mm3                \n\t"
2445
                        "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2446
                        "movd (%%"REG_c", %%"REG_d"), %%mm2\n\t"
2447
                        "punpcklbw %%mm7, %%mm0                \n\t"
2448
                        "punpcklbw %%mm7, %%mm2                \n\t"
2449
                        "pmaddwd %%mm1, %%mm0                \n\t"
2450
                        "pmaddwd %%mm2, %%mm3                \n\t"
2451
                        "paddd %%mm3, %%mm5                \n\t"
2452
                        "paddd %%mm0, %%mm4                \n\t"
2453
                        "add $8, %1                        \n\t"
2454
                        "add $4, %%"REG_c"                \n\t"
2455
                        "cmp %4, %%"REG_c"                \n\t"
2456
                        " jb 2b                                \n\t"
2457
                        "add %6, %1                        \n\t"
2458
                        "psrad $8, %%mm4                \n\t"
2459
                        "psrad $8, %%mm5                \n\t"
2460
                        "packssdw %%mm5, %%mm4                \n\t"
2461
                        "pmaddwd %%mm6, %%mm4                \n\t"
2462
                        "packssdw %%mm4, %%mm4                \n\t"
2463
                        "mov %3, %%"REG_a"                \n\t"
2464
                        "movd %%mm4, (%%"REG_a", %0)        \n\t"
2465
                        "add $4, %0                        \n\t"
2466
                        " jnc 1b                        \n\t"
2467

    
2468
                        : "+r" (counter), "+r" (filter)
2469
                        : "m" (filterPos), "m" (dst), "m"(offset),
2470
                          "m" (src), "r" (filterSize*2)
2471
                        : "%"REG_a, "%"REG_c, "%"REG_d
2472
                );
2473
        }
2474
#else
2475
#ifdef HAVE_ALTIVEC
2476
        hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2477
#else
2478
        int i;
2479
        for(i=0; i<dstW; i++)
2480
        {
2481
                int j;
2482
                int srcPos= filterPos[i];
2483
                int val=0;
2484
//                printf("filterPos: %d\n", filterPos[i]);
2485
                for(j=0; j<filterSize; j++)
2486
                {
2487
//                        printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2488
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2489
                }
2490
//                filter += hFilterSize;
2491
                dst[i] = clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2492
//                dst[i] = val>>7;
2493
        }
2494
#endif
2495
#endif
2496
}
2497
      // *** horizontal scale Y line to temp buffer
2498
/**
 * Horizontally scale one line of luma samples from src into dst.
 *
 * If srcFormat is one of the packed YUV, 16-bit gray, RGB/BGR or paletted
 * formats tested below, the line is first converted with the matching
 * RENAME(...ToY) helper into formatConvBuffer and that buffer is then used as
 * the source instead of src.
 *
 * Scaling path selection:
 *  - SWS_FAST_BILINEAR not set in flags (or, when HAVE_MMX is defined,
 *    canMMX2BeUsed is 0): RENAME(hScale) with hLumFilter / hLumFilterPos /
 *    hLumFilterSize.
 *  - otherwise "fast bilinear":
 *      - ARCH_X86 + HAVE_MMX2 + canMMX2BeUsed: the code at funnyYCode is
 *        called 8 times from inline asm (NOTE(review): presumably code
 *        generated at init time elsewhere -- confirm), then the tail of dst
 *        is patched in C.
 *      - ARCH_X86 otherwise: a plain integer asm loop.
 *      - other architectures: a C loop.
 *
 * The C fallback stores (src[xx]<<7) + (src[xx+1]-src[xx])*xalpha with a
 * 7-bit xalpha, i.e. the result is the source sample scaled by 128. It reads
 * src[xx+1], so one sample past the last indexed position must be readable.
 *
 * @param dst              output line of 16-bit samples
 * @param dstWidth         number of output samples
 * @param src              input line (replaced by formatConvBuffer after conversion)
 * @param srcW             input width, passed to the converters and to hScale
 * @param xInc             source step per output sample; the bilinear paths use
 *                         the upper bits (xInc>>16) as integer step and the low
 *                         16 bits as fraction
 * @param flags            scaler flags; only SWS_FAST_BILINEAR is tested here
 * @param canMMX2BeUsed    nonzero if the MMX2 fast-bilinear code may be used
 * @param hLumFilter       filter coefficients handed to hScale
 * @param hLumFilterPos    filter positions handed to hScale
 * @param hLumFilterSize   filter size handed to hScale
 * @param funnyYCode       address called by the MMX2 path
 * @param srcFormat        pixel format of src
 * @param formatConvBuffer scratch buffer receiving the converted luma line
 * @param mmx2Filter       loaded into REG_d before calling funnyYCode
 * @param mmx2FilterPos    loaded into REG_b before calling funnyYCode
 * @param pal              palette passed to palToY for 8-bit formats
 */
static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2499
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2500
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
2501
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2502
                                   int32_t *mmx2FilterPos, uint8_t *pal)
2503
{
2504
    if(srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2505
    {
2506
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
2507
        src= formatConvBuffer;
2508
    }
2509
    else if(srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2510
    {
2511
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
2512
        src= formatConvBuffer;
2513
    }
2514
    else if(srcFormat==PIX_FMT_RGB32)
2515
    {
2516
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
2517
        src= formatConvBuffer;
2518
    }
2519
    else if(srcFormat==PIX_FMT_BGR24)
2520
    {
2521
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
2522
        src= formatConvBuffer;
2523
    }
2524
    else if(srcFormat==PIX_FMT_BGR565)
2525
    {
2526
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
2527
        src= formatConvBuffer;
2528
    }
2529
    else if(srcFormat==PIX_FMT_BGR555)
2530
    {
2531
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
2532
        src= formatConvBuffer;
2533
    }
2534
    else if(srcFormat==PIX_FMT_BGR32)
2535
    {
2536
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
2537
        src= formatConvBuffer;
2538
    }
2539
    else if(srcFormat==PIX_FMT_RGB24)
2540
    {
2541
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
2542
        src= formatConvBuffer;
2543
    }
2544
    else if(srcFormat==PIX_FMT_RGB565)
2545
    {
2546
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
2547
        src= formatConvBuffer;
2548
    }
2549
    else if(srcFormat==PIX_FMT_RGB555)
2550
    {
2551
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
2552
        src= formatConvBuffer;
2553
    }
2554
    else if(srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8)
2555
    {
2556
        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2557
        src= formatConvBuffer;
2558
    }
2559

    
2560
#ifdef HAVE_MMX
2561
        // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
2562
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2563
#else
2564
    if(!(flags&SWS_FAST_BILINEAR))
2565
#endif
2566
    {
2567
            RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2568
    }
2569
    else // Fast Bilinear upscale / crap downscale
2570
    {
2571
#if defined(ARCH_X86)
2572
#ifdef HAVE_MMX2
2573
        int i;
2574
#if defined(PIC)
        // scratch slot used by the asm below to save and restore REG_b in PIC builds
2575
        uint64_t ebxsave __attribute__((aligned(8)));
2576
#endif
2577
        if(canMMX2BeUsed)
2578
        {
2579
                asm volatile(
2580
#if defined(PIC)
2581
                        "mov %%"REG_b", %5    \n\t"
2582
#endif
2583
                        "pxor %%mm7, %%mm7                \n\t"
2584
                        "mov %0, %%"REG_c"                \n\t"
2585
                        "mov %1, %%"REG_D"                \n\t"
2586
                        "mov %2, %%"REG_d"                \n\t"
2587
                        "mov %3, %%"REG_b"                \n\t"
2588
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
2589
                        PREFETCH" (%%"REG_c")                \n\t"
2590
                        PREFETCH" 32(%%"REG_c")                \n\t"
2591
                        PREFETCH" 64(%%"REG_c")                \n\t"
2592

    
2593
#ifdef ARCH_X86_64
2594

    
2595
#define FUNNY_Y_CODE \
2596
                        "movl (%%"REG_b"), %%esi        \n\t"\
2597
                        "call *%4                        \n\t"\
2598
                        "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2599
                        "add %%"REG_S", %%"REG_c"        \n\t"\
2600
                        "add %%"REG_a", %%"REG_D"        \n\t"\
2601
                        "xor %%"REG_a", %%"REG_a"        \n\t"\
2602

    
2603
#else
2604

    
2605
#define FUNNY_Y_CODE \
2606
                        "movl (%%"REG_b"), %%esi        \n\t"\
2607
                        "call *%4                        \n\t"\
2608
                        "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2609
                        "add %%"REG_a", %%"REG_D"        \n\t"\
2610
                        "xor %%"REG_a", %%"REG_a"        \n\t"\
2611

    
2612
#endif
2613

    
2614
FUNNY_Y_CODE
2615
FUNNY_Y_CODE
2616
FUNNY_Y_CODE
2617
FUNNY_Y_CODE
2618
FUNNY_Y_CODE
2619
FUNNY_Y_CODE
2620
FUNNY_Y_CODE
2621
FUNNY_Y_CODE
2622

    
2623
#if defined(PIC)
2624
                        "mov %5, %%"REG_b"    \n\t"
2625
#endif
2626
                        :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2627
                        "m" (funnyYCode)
2628
#if defined(PIC)
2629
                        ,"m" (ebxsave)
2630
#endif
2631
                        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2632
#if !defined(PIC)
2633
                        ,"%"REG_b
2634
#endif
2635
                );
                // overwrite the tail outputs whose source index (i*xInc)>>16 reaches srcW-1
                // with the last source sample scaled by 128
2636
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2637
        }
2638
        else
2639
        {
2640
#endif
        // split xInc into integer step and 16-bit fraction for the addw/adc pair below
2641
        long xInc_shr16 = xInc >> 16;
2642
        uint16_t xInc_mask = xInc & 0xffff;
2643
        //NO MMX just normal asm ...
        // the loop body is written out twice: it stores dst[i] and dst[i+1], then advances i by 2
2644
        asm volatile(
2645
                "xor %%"REG_a", %%"REG_a"        \n\t" // i
2646
                "xor %%"REG_d", %%"REG_d"        \n\t" // xx
2647
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
2648
                ASMALIGN(4)
2649
                "1:                                \n\t"
2650
                "movzbl  (%0, %%"REG_d"), %%edi        \n\t" //src[xx]
2651
                "movzbl 1(%0, %%"REG_d"), %%esi        \n\t" //src[xx+1]
2652
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2653
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2654
                "shll $16, %%edi                \n\t"
2655
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2656
                "mov %1, %%"REG_D"                \n\t"
2657
                "shrl $9, %%esi                        \n\t"
2658
                "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2659
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
2660
                "adc %3, %%"REG_d"                \n\t" //xx+= xInc>>16 + carry
2661

    
2662
                "movzbl (%0, %%"REG_d"), %%edi        \n\t" //src[xx]
2663
                "movzbl 1(%0, %%"REG_d"), %%esi        \n\t" //src[xx+1]
2664
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2665
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2666
                "shll $16, %%edi                \n\t"
2667
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2668
                "mov %1, %%"REG_D"                \n\t"
2669
                "shrl $9, %%esi                        \n\t"
2670
                "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
2671
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
2672
                "adc %3, %%"REG_d"                \n\t" //xx+= xInc>>16 + carry
2673

    
2674

    
2675
                "add $2, %%"REG_a"                \n\t"
2676
                "cmp %2, %%"REG_a"                \n\t"
2677
                " jb 1b                                \n\t"
2678

    
2679

    
2680
                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2681
                : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2682
                );
2683
#ifdef HAVE_MMX2
2684
        } //if MMX2 can't be used
2685
#endif
2686
#else
        // portable C fallback: xpos advances by xInc per output sample
2687
        int i;
2688
        unsigned int xpos=0;
2689
        for(i=0;i<dstWidth;i++)
2690
        {
                // xx: integer source index, xalpha: top 7 bits of the 16-bit fraction
2691
                register unsigned int xx=xpos>>16;
2692
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
2693
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2694
                xpos+=xInc;
2695
        }
2696
#endif
2697
    }
2698
}
2699

    
2700
/**
 * Horizontally scale one line of two chroma planes (src1, src2) into dst.
 *
 * The first plane is written to dst[0..], the second to dst[2048..]
 * (NOTE(review): presumably U and V -- confirm against the callers).
 *
 * If srcFormat is one of the packed YUV, RGB/BGR or paletted formats tested
 * below, both planes are first produced by the matching RENAME(...ToUV)
 * helper into formatConvBuffer and formatConvBuffer+2048, which then replace
 * src1 and src2. For gray formats (isGray(srcFormat)) the function returns
 * without writing anything to dst.
 *
 * Scaling path selection:
 *  - SWS_FAST_BILINEAR not set in flags (or, when HAVE_MMX is defined,
 *    canMMX2BeUsed is 0): RENAME(hScale) is called once per plane with
 *    hChrFilter / hChrFilterPos / hChrFilterSize.
 *  - otherwise "fast bilinear":
 *      - ARCH_X86 + HAVE_MMX2 + canMMX2BeUsed: the code at funnyUVCode is
 *        called 4 times per plane from inline asm (NOTE(review): presumably
 *        code generated at init time elsewhere -- confirm), then the tail of
 *        both output planes is patched in C.
 *      - ARCH_X86 otherwise: a plain integer asm loop handling both planes.
 *      - other architectures: a C loop.
 *
 * The C fallback stores src[xx]*(xalpha^127) + src[xx+1]*xalpha with a 7-bit
 * xalpha. It reads src1[xx+1] / src2[xx+1], so one sample past the last
 * indexed position must be readable.
 *
 * @param dst              output buffer of 16-bit samples; second plane at dst+2048
 * @param dstWidth         number of output samples per plane
 * @param src1             first input plane (replaced after format conversion)
 * @param src2             second input plane (replaced after format conversion)
 * @param srcW             input width, passed to the converters and to hScale
 * @param xInc             source step per output sample; the bilinear paths use
 *                         the upper bits (xInc>>16) as integer step and the low
 *                         16 bits as fraction
 * @param flags            scaler flags; only SWS_FAST_BILINEAR is tested here
 * @param canMMX2BeUsed    nonzero if the MMX2 fast-bilinear code may be used
 * @param hChrFilter       filter coefficients handed to hScale
 * @param hChrFilterPos    filter positions handed to hScale
 * @param hChrFilterSize   filter size handed to hScale
 * @param funnyUVCode      address called by the MMX2 path
 * @param srcFormat        pixel format of the source
 * @param formatConvBuffer scratch buffer receiving the converted chroma planes
 * @param mmx2Filter       loaded into REG_d before calling funnyUVCode
 * @param mmx2FilterPos    loaded into REG_b before calling funnyUVCode
 * @param pal              palette passed to palToUV for 8-bit formats
 */
inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2701
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2702
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2703
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2704
                                   int32_t *mmx2FilterPos, uint8_t *pal)
2705
{
2706
    if(srcFormat==PIX_FMT_YUYV422)
2707
    {
2708
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2709
        src1= formatConvBuffer;
2710
        src2= formatConvBuffer+2048;
2711
    }
2712
    else if(srcFormat==PIX_FMT_UYVY422)
2713
    {
2714
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2715
        src1= formatConvBuffer;
2716
        src2= formatConvBuffer+2048;
2717
    }
2718
    else if(srcFormat==PIX_FMT_RGB32)
2719
    {
2720
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2721
        src1= formatConvBuffer;
2722
        src2= formatConvBuffer+2048;
2723
    }
2724
    else if(srcFormat==PIX_FMT_BGR24)
2725
    {
2726
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2727
        src1= formatConvBuffer;
2728
        src2= formatConvBuffer+2048;
2729
    }
2730
    else if(srcFormat==PIX_FMT_BGR565)
2731
    {
2732
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2733
        src1= formatConvBuffer;
2734
        src2= formatConvBuffer+2048;
2735
    }
2736
    else if(srcFormat==PIX_FMT_BGR555)
2737
    {
2738
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2739
        src1= formatConvBuffer;
2740
        src2= formatConvBuffer+2048;
2741
    }
2742
    else if(srcFormat==PIX_FMT_BGR32)
2743
    {
2744
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2745
        src1= formatConvBuffer;
2746
        src2= formatConvBuffer+2048;
2747
    }
2748
    else if(srcFormat==PIX_FMT_RGB24)
2749
    {
2750
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2751
        src1= formatConvBuffer;
2752
        src2= formatConvBuffer+2048;
2753
    }
2754
    else if(srcFormat==PIX_FMT_RGB565)
2755
    {
2756
        RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2757
        src1= formatConvBuffer;
2758
        src2= formatConvBuffer+2048;
2759
    }
2760
    else if(srcFormat==PIX_FMT_RGB555)
2761
    {
2762
        RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2763
        src1= formatConvBuffer;
2764
        src2= formatConvBuffer+2048;
2765
    }
2766
    else if(isGray(srcFormat))
2767
    {
            // gray input: nothing is scaled and dst is left untouched
2768
            return;
2769
    }
2770
    else if(srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8)
2771
    {
2772
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW, pal);
2773
        src1= formatConvBuffer;
2774
        src2= formatConvBuffer+2048;
2775
    }
2776

    
2777
#ifdef HAVE_MMX
2778
        // use the new MMX scaler if the mmx2 can't be used (it's faster than the x86asm one)
2779
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2780
#else
2781
    if(!(flags&SWS_FAST_BILINEAR))
2782
#endif
2783
    {
2784
            RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2785
            RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2786
    }
2787
    else // Fast Bilinear upscale / crap downscale
2788
    {
2789
#if defined(ARCH_X86)
2790
#ifdef HAVE_MMX2
2791
        int i;
2792
#if defined(PIC)
        // scratch slot used by the asm below to save and restore REG_b in PIC builds
2793
        uint64_t ebxsave __attribute__((aligned(8)));
2794
#endif
2795
        if(canMMX2BeUsed)
2796
        {
2797
                asm volatile(
2798
#if defined(PIC)
2799
                        "mov %%"REG_b", %6    \n\t"
2800
#endif
2801
                        "pxor %%mm7, %%mm7                \n\t"
2802
                        "mov %0, %%"REG_c"                \n\t"
2803
                        "mov %1, %%"REG_D"                \n\t"
2804
                        "mov %2, %%"REG_d"                \n\t"
2805
                        "mov %3, %%"REG_b"                \n\t"
2806
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
2807
                        PREFETCH" (%%"REG_c")                \n\t"
2808
                        PREFETCH" 32(%%"REG_c")                \n\t"
2809
                        PREFETCH" 64(%%"REG_c")                \n\t"
2810

    
2811
#ifdef ARCH_X86_64
2812

    
2813
#define FUNNY_UV_CODE \
2814
                        "movl (%%"REG_b"), %%esi        \n\t"\
2815
                        "call *%4                        \n\t"\
2816
                        "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
2817
                        "add %%"REG_S", %%"REG_c"        \n\t"\
2818
                        "add %%"REG_a", %%"REG_D"        \n\t"\
2819
                        "xor %%"REG_a", %%"REG_a"        \n\t"\
2820

    
2821
#else
2822

    
2823
#define FUNNY_UV_CODE \
2824
                        "movl (%%"REG_b"), %%esi        \n\t"\
2825
                        "call *%4                        \n\t"\
2826
                        "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
2827
                        "add %%"REG_a", %%"REG_D"        \n\t"\
2828
                        "xor %%"REG_a", %%"REG_a"        \n\t"\
2829

    
2830
#endif
2831

    
2832
FUNNY_UV_CODE
2833
FUNNY_UV_CODE
2834
FUNNY_UV_CODE
2835
FUNNY_UV_CODE
                        // second plane: restart with src2 as source and dst+2048 as destination
2836
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
2837
                        "mov %5, %%"REG_c"                \n\t" // src
2838
                        "mov %1, %%"REG_D"                \n\t" // buf1
2839
                        "add $4096, %%"REG_D"                \n\t" // 4096 bytes = 2048 uint16_t entries, i.e. dst+2048
2840
                        PREFETCH" (%%"REG_c")                \n\t"
2841
                        PREFETCH" 32(%%"REG_c")                \n\t"
2842
                        PREFETCH" 64(%%"REG_c")                \n\t"
2843

    
2844
FUNNY_UV_CODE
2845
FUNNY_UV_CODE
2846
FUNNY_UV_CODE
2847
FUNNY_UV_CODE
2848

    
2849
#if defined(PIC)
2850
                        "mov %6, %%"REG_b"    \n\t"
2851
#endif
2852
                        :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2853
                        "m" (funnyUVCode), "m" (src2)
2854
#if defined(PIC)
2855
                        ,"m" (ebxsave)
2856
#endif
2857
                        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2858
#if !defined(PIC)
2859
                         ,"%"REG_b
2860
#endif
2861
                );
                // overwrite the tail outputs whose source index (i*xInc)>>16 reaches srcW-1
                // with the last source sample of each plane scaled by 128
2862
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2863
                {
2864
//                        printf("%d %d %d\n", dstWidth, i, srcW);
2865
                        dst[i] = src1[srcW-1]*128;
2866
                        dst[i+2048] = src2[srcW-1]*128;
2867
                }
2868
        }
2869
        else
2870
        {
2871
#endif
        // split xInc into integer step and 16-bit fraction for the addw/adc pair below
2872
        long xInc_shr16 = (long) (xInc >> 16);
2873
        uint16_t xInc_mask = xInc & 0xffff; 
        // one iteration produces one output sample for each plane: dst[i] and dst[i+2048]
2874
        asm volatile(
2875
                "xor %%"REG_a", %%"REG_a"        \n\t" // i
2876
                "xor %%"REG_d", %%"REG_d"                \n\t" // xx
2877
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
2878
                ASMALIGN(4)
2879
                "1:                                \n\t"
2880
                "mov %0, %%"REG_S"                \n\t"
2881
                "movzbl  (%%"REG_S", %%"REG_d"), %%edi        \n\t" //src[xx]
2882
                "movzbl 1(%%"REG_S", %%"REG_d"), %%esi        \n\t" //src[xx+1]
2883
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2884
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2885
                "shll $16, %%edi                \n\t"
2886
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2887
                "mov %1, %%"REG_D"                \n\t"
2888
                "shrl $9, %%esi                        \n\t"
2889
                "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
2890

    
2891
                "movzbl  (%5, %%"REG_d"), %%edi        \n\t" //src[xx]
2892
                "movzbl 1(%5, %%"REG_d"), %%esi        \n\t" //src[xx+1]
2893
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2894
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2895
                "shll $16, %%edi                \n\t"
2896
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2897
                "mov %1, %%"REG_D"                \n\t"
2898
                "shrl $9, %%esi                        \n\t"
2899
                "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"
2900

    
2901
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
2902
                "adc %3, %%"REG_d"                \n\t" //xx+= xInc>>16 + carry
2903
                "add $1, %%"REG_a"                \n\t"
2904
                "cmp %2, %%"REG_a"                \n\t"
2905
                " jb 1b                                \n\t"
2906

    
2907
/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2908
   which is needed to support GCC-4.0 */
2909
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2910
                :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2911
#else
2912
                :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2913
#endif
2914
                "r" (src2)
2915
                : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2916
                );
2917
#ifdef HAVE_MMX2
2918
        } //if MMX2 can't be used
2919
#endif
2920
#else
        // portable C fallback: xpos advances by xInc per output sample
2921
        int i;
2922
        unsigned int xpos=0;
2923
        for(i=0;i<dstWidth;i++)
2924
        {
                // xx: integer source index, xalpha: top 7 bits of the 16-bit fraction;
                // xalpha^127 equals 127-xalpha for a 7-bit value
2925
                register unsigned int xx=xpos>>16;
2926
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
2927
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2928
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2929
/* slower
2930
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2931
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2932
*/
2933
                xpos+=xInc;
2934
        }
2935
#endif
2936
   }
2937
}
2938

    
2939
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2940
             int srcSliceH, uint8_t* dst[], int dstStride[]){
2941

    
2942
        /* load a few things into local vars to make the code more readable? and faster */
2943
        const int srcW= c->srcW;
2944
        const int dstW= c->dstW;
2945
        const int dstH= c->dstH;
2946
        const int chrDstW= c->chrDstW;
2947
        const int chrSrcW= c->chrSrcW;
2948
        const int lumXInc= c->lumXInc;
2949
        const int chrXInc= c->chrXInc;
2950
        const int dstFormat= c->dstFormat;
2951
        const int srcFormat= c->srcFormat;
2952
        const int flags= c->flags;
2953
        const int canMMX2BeUsed= c->canMMX2BeUsed;
2954
        int16_t *vLumFilterPos= c->vLumFilterPos;
2955
        int16_t *vChrFilterPos= c->vChrFilterPos;
2956
        int16_t *hLumFilterPos= c->hLumFilterPos;
2957
        int16_t *hChrFilterPos= c->hChrFilterPos;
2958
        int16_t *vLumFilter= c->vLumFilter;
2959
        int16_t *vChrFilter= c->vChrFilter;
2960
        int16_t *hLumFilter= c->hLumFilter;
2961
        int16_t *hChrFilter= c->hChrFilter;
2962
        int32_t *lumMmxFilter= c->lumMmxFilter;
2963
        int32_t *chrMmxFilter= c->chrMmxFilter;
2964
        const int vLumFilterSize= c->vLumFilterSize;
2965
        const int vChrFilterSize= c->vChrFilterSize;
2966
        const int hLumFilterSize= c->hLumFilterSize;
2967
        const int hChrFilterSize= c->hChrFilterSize;
2968
        int16_t **lumPixBuf= c->lumPixBuf;
2969
        int16_t **chrPixBuf= c->chrPixBuf;
2970
        const int vLumBufSize= c->vLumBufSize;
2971
        const int vChrBufSize= c->vChrBufSize;
2972
        uint8_t *funnyYCode= c->funnyYCode;
2973
        uint8_t *funnyUVCode= c->funnyUVCode;
2974
        uint8_t *formatConvBuffer= c->formatConvBuffer;
2975
        const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2976
        const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2977
        int lastDstY;
2978
        uint8_t *pal=NULL;
2979

    
2980
        /* vars which will change and which we need to store back in the context */
2981
        int dstY= c->dstY;
2982
        int lumBufIndex= c->lumBufIndex;
2983
        int chrBufIndex= c->chrBufIndex;
2984
        int lastInLumBuf= c->lastInLumBuf;
2985
        int lastInChrBuf= c->lastInChrBuf;
2986
        
2987
        if(isPacked(c->srcFormat)){
2988
                pal= src[1];
2989
                src[0]=
2990
                src[1]=
2991
                src[2]= src[0];
2992
                srcStride[0]=
2993
                srcStride[1]=
2994
                srcStride[2]= srcStride[0];
2995
        }
2996
        srcStride[1]<<= c->vChrDrop;
2997
        srcStride[2]<<= c->vChrDrop;
2998

    
2999
//        printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
3000
//                (int)dst[0], (int)dst[1], (int)dst[2]);
3001

    
3002
#if 0 //self test FIXME move to a vfilter or something
3003
{
3004
static volatile int i=0;
3005
i++;
3006
if(srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
3007
        selfTest(src, srcStride, c->srcW, c->srcH);
3008
i--;
3009
}
3010
#endif
3011

    
3012
//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
3013
//dstStride[0],dstStride[1],dstStride[2]);
3014

    
3015
        if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3016
        {
3017
                static int firstTime=1; //FIXME move this into the context perhaps
3018
                if(flags & SWS_PRINT_INFO && firstTime)
3019
                {
3020
                        av_log(c, AV_LOG_WARNING, "SwScaler: Warning: dstStride is not aligned!\n"
3021
                                        "SwScaler:          ->cannot do aligned memory acesses anymore\n");
3022
                        firstTime=0;
3023
                }
3024
        }
3025

    
3026
        /* Note the user might start scaling the picture in the middle so this will not get executed
3027
           this is not really intended but works currently, so ppl might do it */
3028
        if(srcSliceY ==0){
3029
                lumBufIndex=0;
3030
                chrBufIndex=0;
3031
                dstY=0;        
3032
                lastInLumBuf= -1;
3033
                lastInChrBuf= -1;
3034
        }
3035

    
3036
        lastDstY= dstY;
3037

    
3038
        for(;dstY < dstH; dstY++){
3039
                unsigned char *dest =dst[0]+dstStride[0]*dstY;
3040
                const int chrDstY= dstY>>c->chrDstVSubSample;
3041
                unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3042
                unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3043

    
3044
                const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3045
                const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3046
                const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3047
                const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3048

    
3049
//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3050
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
3051
                //handle holes (FAST_BILINEAR & weird filters)
3052
                if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3053
                if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3054
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3055
                ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
3056
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
3057

    
3058
                // Do we have enough lines in this slice to output the dstY line
3059
                if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3060
                {
3061
                        //Do horizontal scaling
3062
                        while(lastInLumBuf < lastLumSrcY)
3063
                        {
3064
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3065
                                lumBufIndex++;
3066
//                                printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
3067
                                ASSERT(lumBufIndex < 2*vLumBufSize)
3068
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3069
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3070
//                                printf("%d %d\n", lumBufIndex, vLumBufSize);
3071
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3072
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3073
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
3074
                                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3075
                                lastInLumBuf++;
3076
                        }
3077
                        while(lastInChrBuf < lastChrSrcY)
3078
                        {
3079
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3080
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3081
                                chrBufIndex++;
3082
                                ASSERT(chrBufIndex < 2*vChrBufSize)
3083
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
3084
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3085
                                //FIXME replace parameters through context struct (some at least)
3086

    
3087
                                if(!(isGray(srcFormat) || isGray(dstFormat)))
3088
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3089
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3090
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
3091
                                                c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3092
                                lastInChrBuf++;
3093
                        }
3094
                        //wrap buf index around to stay inside the ring buffer
3095
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
3096
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
3097
                }
3098
                else // not enough lines left in this slice -> load the rest in the buffer
3099
                {
3100
/*                printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3101
                        firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3102
                        lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3103
                        vChrBufSize, vLumBufSize);*/
3104

    
3105
                        //Do horizontal scaling
3106
                        while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3107
                        {
3108
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3109
                                lumBufIndex++;
3110
                                ASSERT(lumBufIndex < 2*vLumBufSize)
3111
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3112
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3113
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3114
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3115
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
3116
                                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3117
                                lastInLumBuf++;
3118
                        }
3119
                        while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3120
                        {
3121
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3122
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3123
                                chrBufIndex++;
3124
                                ASSERT(chrBufIndex < 2*vChrBufSize)
3125
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
3126
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3127

    
3128
                                if(!(isGray(srcFormat) || isGray(dstFormat)))
3129
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3130
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3131
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
3132
                                                c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3133
                                lastInChrBuf++;
3134
                        }
3135
                        //wrap buf index around to stay inside the ring buffer
3136
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
3137
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
3138
                        break; //we can't output a dstY line so let's try with the next slice
3139
                }
3140

    
3141
#ifdef HAVE_MMX
3142
                b5Dither= dither8[dstY&1];
3143
                g6Dither= dither4[dstY&1];
3144
                g5Dither= dither8[dstY&1];
3145
                r5Dither= dither8[(dstY+1)&1];
3146
#endif
3147
            if(dstY < dstH-2)
3148
            {
3149
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3150
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3151
#ifdef HAVE_MMX
3152
                int i;
3153
            if(flags & SWS_ACCURATE_RND){
3154
                        for(i=0; i<vLumFilterSize; i+=2){
3155
                                lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i  ];
3156
                                lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3157
                                lumMmxFilter[2*i+2]=
3158
                                lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i    ]
3159
                                                + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3160
                        }
3161
                        for(i=0; i<vChrFilterSize; i+=2){
3162
                                chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i  ];
3163
                                chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3164
                                chrMmxFilter[2*i+2]=
3165
                                chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i    ]
3166
                                                + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3167
                        }
3168
            }else{
3169
                for(i=0; i<vLumFilterSize; i++)
3170
                {
3171
                        lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3172
                        lumMmxFilter[4*i+2]= 
3173
                        lumMmxFilter[4*i+3]= 
3174
                                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3175
                }
3176
                for(i=0; i<vChrFilterSize; i++)
3177
                {