Statistics
| Branch: | Revision:

ffmpeg / libswscale / swscale_template.c @ 4bff9ef9

History | View | Annotate | Download (102 KB)

1
/*
2
    Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3

4
    This program is free software; you can redistribute it and/or modify
5
    it under the terms of the GNU General Public License as published by
6
    the Free Software Foundation; either version 2 of the License, or
7
    (at your option) any later version.
8

9
    This program is distributed in the hope that it will be useful,
10
    but WITHOUT ANY WARRANTY; without even the implied warranty of
11
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
    GNU General Public License for more details.
13

14
    You should have received a copy of the GNU General Public License
15
    along with this program; if not, write to the Free Software
16
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
*/
18

    
/* Reset any previous definitions of the CPU-dispatch helper macros so that
 * this template can be #included several times with different HAVE_* flags
 * (each inclusion builds a differently-optimized set of scaler functions). */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Prefetch selection: 3DNow! provides prefetch/prefetchw, MMX2 (SSE)
 * provides prefetchnta/prefetcht0; otherwise expand to a no-op placeholder
 * string so the asm templates still assemble. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined ( HAVE_MMX2 )
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif

/* sfence only exists with MMX2 (SSE); needed after non-temporal stores. */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE "/nop"
#endif

/* Packed byte average: pavgb (MMX2) or pavgusb (3DNow!). Note there is no
 * plain-MMX fallback, so PAVGB stays undefined without either feature. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* 64-bit store: non-temporal movntq with MMX2, plain movq otherwise.
 * MOVNTQ is a second macro layer so its arguments are macro-expanded
 * before stringification. */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
/* Vertical scale of one plane (fast path, pmulhw precision).
 * Walks the (coefficient, line pointer) list at "offset"(%0); the list is
 * terminated by a NULL source pointer (the test/jnz pair). mm3/mm4 start
 * from the VROUNDER_OFFSET rounding constant, accumulate
 * pmulhw(filterCoeff, srcData) for 2x4 words, are shifted down by 3,
 * packed to unsigned bytes and stored 8 output pixels per outer iteration.
 * Operands: %0 = &c->redDither (base for context offsets), %1 = dest,
 * %2 = width (loop bound for REG_a).
 * NOTE(review): the filter list is re-walked for every 8-pixel group;
 * the FIXME below suggests unrolling instead. */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
                asm volatile(\
                        "xor %%"REG_a", %%"REG_a"        \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                        "movq %%mm3, %%mm4                \n\t"\
                        "lea " offset "(%0), %%"REG_d"        \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        ASMALIGN(4) /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                        "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
                        "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5\n\t" /* srcData */\
                        "add $16, %%"REG_d"                \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        "test %%"REG_S", %%"REG_S"        \n\t"\
                        "pmulhw %%mm0, %%mm2                \n\t"\
                        "pmulhw %%mm0, %%mm5                \n\t"\
                        "paddw %%mm2, %%mm3                \n\t"\
                        "paddw %%mm5, %%mm4                \n\t"\
                        " jnz 1b                        \n\t"\
                        "psraw $3, %%mm3                \n\t"\
                        "psraw $3, %%mm4                \n\t"\
                        "packuswb %%mm4, %%mm3                \n\t"\
                        MOVNTQ(%%mm3, (%1, %%REGa))\
                        "add $8, %%"REG_a"                \n\t"\
                        "cmp %2, %%"REG_a"                \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                        "movq %%mm3, %%mm4                \n\t"\
                        "lea " offset "(%0), %%"REG_d"        \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        "jb 1b                                \n\t"\
                        :: "r" (&c->redDither),\
                        "r" (dest), "p" (width)\
                        : "%"REG_a, "%"REG_d, "%"REG_S\
                );
/* Vertical scale of one plane, higher-precision variant.
 * Processes two source lines per inner iteration: words from line pairs are
 * interleaved with punpcklwd/punpckhwd and multiplied with pmaddwd so the
 * sums are kept as 32-bit dwords (mm4-mm7) instead of pmulhw's truncated
 * words. After the filter list ends (NULL pointer -> test/jnz falls
 * through), the dwords are shifted down 16, packed back to words, the
 * VROUNDER_OFFSET constant is added, shifted by 3 and packed to bytes:
 * 8 output pixels per outer iteration.
 * Operands as in YSCALEYUV2YV12X: %0 = &c->redDither, %1 = dest, %2 = width. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
                asm volatile(\
                        "lea " offset "(%0), %%"REG_d"        \n\t"\
                        "xor %%"REG_a", %%"REG_a"        \n\t"\
                        "pxor %%mm4, %%mm4              \n\t"\
                        "pxor %%mm5, %%mm5              \n\t"\
                        "pxor %%mm6, %%mm6              \n\t"\
                        "pxor %%mm7, %%mm7              \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        ASMALIGN(4) \
                        "1:                                \n\t"\
                        "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm0\n\t" /* srcData */\
                        "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
                        "mov 4(%%"REG_d"), %%"REG_S"        \n\t"\
                        "movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm1\n\t" /* srcData */\
                        "movq %%mm0, %%mm3              \n\t"\
                        "punpcklwd %%mm1, %%mm0        \n\t"\
                        "punpckhwd %%mm1, %%mm3        \n\t"\
                        "movq 8(%%"REG_d"), %%mm1        \n\t" /* filterCoeff */\
                        "pmaddwd %%mm1, %%mm0           \n\t"\
                        "pmaddwd %%mm1, %%mm3           \n\t"\
                        "paddd %%mm0, %%mm4             \n\t"\
                        "paddd %%mm3, %%mm5             \n\t"\
                        "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3\n\t" /* srcData */\
                        "mov 16(%%"REG_d"), %%"REG_S"        \n\t"\
                        "add $16, %%"REG_d"                \n\t"\
                        "test %%"REG_S", %%"REG_S"      \n\t"\
                        "movq %%mm2, %%mm0              \n\t"\
                        "punpcklwd %%mm3, %%mm2        \n\t"\
                        "punpckhwd %%mm3, %%mm0        \n\t"\
                        "pmaddwd %%mm1, %%mm2           \n\t"\
                        "pmaddwd %%mm1, %%mm0           \n\t"\
                        "paddd %%mm2, %%mm6             \n\t"\
                        "paddd %%mm0, %%mm7             \n\t"\
                        " jnz 1b                        \n\t"\
                        "psrad $16, %%mm4                \n\t"\
                        "psrad $16, %%mm5                \n\t"\
                        "psrad $16, %%mm6                \n\t"\
                        "psrad $16, %%mm7                \n\t"\
                        "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
                        "packssdw %%mm5, %%mm4                \n\t"\
                        "packssdw %%mm7, %%mm6                \n\t"\
                        "paddw %%mm0, %%mm4             \n\t"\
                        "paddw %%mm0, %%mm6             \n\t"\
                        "psraw $3, %%mm4                \n\t"\
                        "psraw $3, %%mm6                \n\t"\
                        "packuswb %%mm6, %%mm4                \n\t"\
                        MOVNTQ(%%mm4, (%1, %%REGa))\
                        "add $8, %%"REG_a"                \n\t"\
                        "cmp %2, %%"REG_a"                \n\t"\
                        "lea " offset "(%0), %%"REG_d"        \n\t"\
                        "pxor %%mm4, %%mm4              \n\t"\
                        "pxor %%mm5, %%mm5              \n\t"\
                        "pxor %%mm6, %%mm6              \n\t"\
                        "pxor %%mm7, %%mm7              \n\t"\
                        "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                        "jb 1b                                \n\t"\
                        :: "r" (&c->redDither),\
                        "r" (dest), "p" (width)\
                        : "%"REG_a, "%"REG_d, "%"REG_S\
                );
/* 1:1 vertical pass: no filtering, just convert the 16-bit intermediate
 * words back to bytes (psraw $7) and store 8 pixels per iteration.
 * Asm-fragment only (no asm volatile wrapper; the caller supplies operands):
 * %0 = source, %1 = dest, %2 = loop counter start.
 * NOTE(review): the counter is loaded from %2 and the loop runs until
 * "add $8" produces a carry (jnc), so %2 presumably holds a negative
 * offset (-width) indexing backwards from the buffer ends -- confirm at
 * the call site. */
#define YSCALEYUV2YV121 \
                        "mov %2, %%"REG_a"                \n\t"\
                        ASMALIGN(4) /* FIXME Unroll? */\
                        "1:                                \n\t"\
                        "movq (%0, %%"REG_a", 2), %%mm0        \n\t"\
                        "movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
                        "psraw $7, %%mm0                \n\t"\
                        "psraw $7, %%mm1                \n\t"\
                        "packuswb %%mm1, %%mm0                \n\t"\
                        MOVNTQ(%%mm0, (%1, %%REGa))\
                        "add $8, %%"REG_a"                \n\t"\
                        "jnc 1b                                \n\t"
/*
                        :: "m" (-lumFilterSize), "m" (-chrFilterSize),
                           "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
                           "r" (dest), "m" (dstW),
                           "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
                        : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/* Opening half of the vertical scaler for packed output formats.
 * First inner loop ("2:"): accumulates the chroma filter taps
 * (CHR_MMX_FILTER_OFFSET list, NULL-terminated) -- U data at (REG_S) and
 * V data 4096 bytes above it -- into mm3/mm4, both seeded with the
 * VROUNDER_OFFSET constant. Second inner loop (also labeled "2:", reused
 * backward-jump label): same for luma (LUM_MMX_FILTER_OFFSET) into
 * mm1 (Y1) / mm7 (Y2). Ends with a trailing backslash: the caller appends
 * the YUV->RGB conversion and the store, then closes the statement with
 * YSCALEYUV2PACKEDX_END.
 * Operands: %0 = &c->redDither, %1 = dest, %2 = dstW (see _END). */
#define YSCALEYUV2PACKEDX \
        asm volatile(\
                "xor %%"REG_a", %%"REG_a"        \n\t"\
                ASMALIGN(4)\
                "nop                                \n\t"\
                "1:                                \n\t"\
                "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
                "movq %%mm3, %%mm4                \n\t"\
                ASMALIGN(4)\
                "2:                                \n\t"\
                "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a"), %%mm2        \n\t" /* UsrcData */\
                "movq 4096(%%"REG_S", %%"REG_a"), %%mm5        \n\t" /* VsrcData */\
                "add $16, %%"REG_d"                \n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm3                \n\t"\
                "paddw %%mm5, %%mm4                \n\t"\
                "test %%"REG_S", %%"REG_S"        \n\t"\
                " jnz 2b                        \n\t"\
\
                "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
                "movq %%mm1, %%mm7                \n\t"\
                ASMALIGN(4)\
                "2:                                \n\t"\
                "movq 8(%%"REG_d"), %%mm0        \n\t" /* filterCoeff */\
                "movq (%%"REG_S", %%"REG_a", 2), %%mm2        \n\t" /* Y1srcData */\
                "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5        \n\t" /* Y2srcData */\
                "add $16, %%"REG_d"                \n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pmulhw %%mm0, %%mm2                \n\t"\
                "pmulhw %%mm0, %%mm5                \n\t"\
                "paddw %%mm2, %%mm1                \n\t"\
                "paddw %%mm5, %%mm7                \n\t"\
                "test %%"REG_S", %%"REG_S"        \n\t"\
                " jnz 2b                        \n\t"\

228
#define YSCALEYUV2PACKEDX_END\
229
        :: "r" (&c->redDither), \
230
            "m" (dummy), "m" (dummy), "m" (dummy),\
231
            "r" (dest), "m" (dstW)\
232
        : "%"REG_a, "%"REG_d, "%"REG_S\
233
        );
234

    
/* High-precision opener for packed output, pmaddwd-based like
 * YSCALEYUV2YV12X_ACCURATE. Chroma taps are accumulated as dwords, rounded
 * and parked in the context's U_TEMP/V_TEMP slots (mm registers are all
 * needed for the luma pass); the luma taps then land in mm1 (Y1) and
 * mm7 (Y2), and U/V are reloaded into mm3/mm4 at the end -- the same
 * register contract the YSCALEYUV2RGBX conversion expects. Ends with a
 * trailing backslash for the caller to append conversion + store, closed by
 * YSCALEYUV2PACKEDX_END. %0 = &c->redDither. */
#define YSCALEYUV2PACKEDX_ACCURATE \
        asm volatile(\
                "xor %%"REG_a", %%"REG_a"        \n\t"\
                ASMALIGN(4)\
                "nop                                \n\t"\
                "1:                                \n\t"\
                "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pxor %%mm4, %%mm4              \n\t"\
                "pxor %%mm5, %%mm5              \n\t"\
                "pxor %%mm6, %%mm6              \n\t"\
                "pxor %%mm7, %%mm7              \n\t"\
                ASMALIGN(4)\
                "2:                                \n\t"\
                "movq (%%"REG_S", %%"REG_a"), %%mm0        \n\t" /* UsrcData */\
                "movq 4096(%%"REG_S", %%"REG_a"), %%mm2        \n\t" /* VsrcData */\
                "mov 4(%%"REG_d"), %%"REG_S"        \n\t"\
                "movq (%%"REG_S", %%"REG_a"), %%mm1        \n\t" /* UsrcData */\
                "movq %%mm0, %%mm3              \n\t"\
                "punpcklwd %%mm1, %%mm0        \n\t"\
                "punpckhwd %%mm1, %%mm3        \n\t"\
                "movq 8(%%"REG_d"), %%mm1        \n\t" /* filterCoeff */\
                "pmaddwd %%mm1, %%mm0           \n\t"\
                "pmaddwd %%mm1, %%mm3           \n\t"\
                "paddd %%mm0, %%mm4             \n\t"\
                "paddd %%mm3, %%mm5             \n\t"\
                "movq 4096(%%"REG_S", %%"REG_a"), %%mm3        \n\t" /* VsrcData */\
                "mov 16(%%"REG_d"), %%"REG_S"        \n\t"\
                "add $16, %%"REG_d"                \n\t"\
                "test %%"REG_S", %%"REG_S"      \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "punpcklwd %%mm3, %%mm2        \n\t"\
                "punpckhwd %%mm3, %%mm0        \n\t"\
                "pmaddwd %%mm1, %%mm2           \n\t"\
                "pmaddwd %%mm1, %%mm0           \n\t"\
                "paddd %%mm2, %%mm6             \n\t"\
                "paddd %%mm0, %%mm7             \n\t"\
                " jnz 2b                        \n\t"\
                "psrad $16, %%mm4                \n\t"\
                "psrad $16, %%mm5                \n\t"\
                "psrad $16, %%mm6                \n\t"\
                "psrad $16, %%mm7                \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
                "packssdw %%mm5, %%mm4                \n\t"\
                "packssdw %%mm7, %%mm6                \n\t"\
                "paddw %%mm0, %%mm4             \n\t"\
                "paddw %%mm0, %%mm6             \n\t"\
                "movq %%mm4, "U_TEMP"(%0)       \n\t"\
                "movq %%mm6, "V_TEMP"(%0)       \n\t"\
\
                "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
                "mov (%%"REG_d"), %%"REG_S"        \n\t"\
                "pxor %%mm1, %%mm1              \n\t"\
                "pxor %%mm5, %%mm5              \n\t"\
                "pxor %%mm7, %%mm7              \n\t"\
                "pxor %%mm6, %%mm6              \n\t"\
                ASMALIGN(4)\
                "2:                                \n\t"\
                "movq (%%"REG_S", %%"REG_a", 2), %%mm0        \n\t" /* Y1srcData */\
                "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2        \n\t" /* Y2srcData */\
                "mov 4(%%"REG_d"), %%"REG_S"        \n\t"\
                "movq (%%"REG_S", %%"REG_a", 2), %%mm4        \n\t" /* Y1srcData */\
                "movq %%mm0, %%mm3              \n\t"\
                "punpcklwd %%mm4, %%mm0        \n\t"\
                "punpckhwd %%mm4, %%mm3        \n\t"\
                "movq 8(%%"REG_d"), %%mm4        \n\t" /* filterCoeff */\
                "pmaddwd %%mm4, %%mm0           \n\t"\
                "pmaddwd %%mm4, %%mm3           \n\t"\
                "paddd %%mm0, %%mm1             \n\t"\
                "paddd %%mm3, %%mm5             \n\t"\
                "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3        \n\t" /* Y2srcData */\
                "mov 16(%%"REG_d"), %%"REG_S"        \n\t"\
                "add $16, %%"REG_d"                \n\t"\
                "test %%"REG_S", %%"REG_S"      \n\t"\
                "movq %%mm2, %%mm0              \n\t"\
                "punpcklwd %%mm3, %%mm2        \n\t"\
                "punpckhwd %%mm3, %%mm0        \n\t"\
                "pmaddwd %%mm4, %%mm2           \n\t"\
                "pmaddwd %%mm4, %%mm0           \n\t"\
                "paddd %%mm2, %%mm7             \n\t"\
                "paddd %%mm0, %%mm6             \n\t"\
                " jnz 2b                        \n\t"\
                "psrad $16, %%mm1                \n\t"\
                "psrad $16, %%mm5                \n\t"\
                "psrad $16, %%mm7                \n\t"\
                "psrad $16, %%mm6                \n\t"\
                "movq "VROUNDER_OFFSET"(%0), %%mm0\n\t"\
                "packssdw %%mm5, %%mm1                \n\t"\
                "packssdw %%mm6, %%mm7                \n\t"\
                "paddw %%mm0, %%mm1             \n\t"\
                "paddw %%mm0, %%mm7             \n\t"\
                "movq  "U_TEMP"(%0), %%mm3      \n\t"\
                "movq  "V_TEMP"(%0), %%mm4      \n\t"\


    
/* YUV -> RGB conversion fragment appended after a YSCALEYUV2PACKEDX* opener.
 * Input register contract: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V (established
 * by the opener). Subtracts the U/V/Y offsets, applies the per-context
 * coefficients from the context block at %0, interleaves low/high halves so
 * each color channel covers all 8 pixels, and packs:
 * mm2 = B (B1|B2), mm5 = R, mm4 = G as unsigned bytes; mm7 is zeroed for
 * the subsequent unpack/store code the caller appends. */
#define YSCALEYUV2RGBX \
                "psubw "U_OFFSET"(%0), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"(%0), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"(%0), %%mm3        \n\t"\
                "pmulhw "VG_COEFF"(%0), %%mm4        \n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "pmulhw "UB_COEFF"(%0), %%mm2        \n\t"\
                "pmulhw "VR_COEFF"(%0), %%mm5        \n\t"\
                "psubw "Y_OFFSET"(%0), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"(%0), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"(%0), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"(%0), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
#if 0
/* Disabled legacy bilinear YUV->RGB fragment (dead code, compiled out by
 * the surrounding #if 0); interpolates between buf0/buf1 and uvbuf0/uvbuf1
 * with yalpha1/uvalpha1 weights using the MANGLE()d global coefficient
 * tables instead of per-context ones. Kept only for reference. */
#define FULL_YSCALEYUV2RGB \
                "pxor %%mm7, %%mm7                \n\t"\
                "movd %6, %%mm6                        \n\t" /*yalpha1*/\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "punpcklwd %%mm6, %%mm6                \n\t"\
                "movd %7, %%mm5                        \n\t" /*uvalpha1*/\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "xor %%"REG_a", %%"REG_a"                \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%0, %%"REG_a", 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, %%"REG_a", 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq (%2, %%"REG_a",2), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, %%"REG_a",2), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "pmulhw %%mm6, %%mm0                \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw %%mm5, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "movq 4096(%2, %%"REG_a",2), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "movq 4096(%3, %%"REG_a",2), %%mm0        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "psubw %%mm0, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "psubw "MANGLE(w80)", %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "MANGLE(w400)", %%mm3        \n\t" /* 8(U-128)*/\
                "pmulhw "MANGLE(yCoeff)", %%mm1        \n\t"\
\
\
                "pmulhw %%mm5, %%mm4                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "pmulhw "MANGLE(ubCoeff)", %%mm3\n\t"\
                "psraw $4, %%mm0                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "pmulhw "MANGLE(ugCoeff)", %%mm2\n\t"\
                "paddw %%mm4, %%mm0                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "MANGLE(w400)", %%mm0        \n\t" /* (V-128)8*/\
\
\
                "movq %%mm0, %%mm4                \n\t" /* (V-128)8*/\
                "pmulhw "MANGLE(vrCoeff)", %%mm0\n\t"\
                "pmulhw "MANGLE(vgCoeff)", %%mm4\n\t"\
                "paddw %%mm1, %%mm3                \n\t" /* B*/\
                "paddw %%mm1, %%mm0                \n\t" /* R*/\
                "packuswb %%mm3, %%mm3                \n\t"\
\
                "packuswb %%mm0, %%mm0                \n\t"\
                "paddw %%mm4, %%mm2                \n\t"\
                "paddw %%mm2, %%mm1                \n\t" /* G*/\
\
                "packuswb %%mm1, %%mm1                \n\t"
#endif
/* Two-line bilinear vertical interpolation producing raw YUV words for
 * packed-pixel writers (no RGB conversion): Y in mm1/mm7, U in mm3,
 * V in mm4, all at >>7 precision. The prologue pre-shifts the stored
 * chroma/luma interpolation coefficients right by 3 (and writes them back
 * to the context) so the pmulhw results line up with the psraw $7 path.
 * Operands: %0/%1 = buf0/buf1 (luma), %2/%3 = uvbuf0/uvbuf1 (U, with V
 * 4096 bytes above); "index" is the loop register, "c" the context base.
 * NOTE(review): several inline comments below still say ">>4" from an
 * older variant; the code shifts by 7. */
#define REAL_YSCALEYUV2PACKED(index, c) \
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1\n\t"\
                "psraw $3, %%mm0                \n\t"\
                "psraw $3, %%mm1                \n\t"\
                "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
                "xor "#index", "#index"                \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $7, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $7, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "movq (%0, "#index", 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, "#index", 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, "#index", 2), %%mm6        \n\t" /*buf0[eax]*/\
                "movq 8(%1, "#index", 2), %%mm7        \n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $7, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $7, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
/* Two-line bilinear vertical interpolation followed by YUV->RGB conversion
 * for 8 pixels. Interpolates luma between %0/%1 (buf0/buf1) and chroma
 * between %2/%3 (uvbuf0/uvbuf1, V 4096 bytes above U) with the per-context
 * CHR/LUM filter coefficients, then applies the same offset/coefficient
 * pipeline as YSCALEYUV2RGBX but addressed via "c" rather than %0.
 * Output register contract: mm2 = B, mm5 = R, mm4 = G (packed bytes for
 * all 8 pixels), mm7 zeroed; caller appends the unpack/store tail.
 * The trailing REAL_/plain macro pair exists so arguments are expanded
 * before pasting. */
#define REAL_YSCALEYUV2RGB(index, c) \
                "xor "#index", "#index"        \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5\n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4\n\t" /* uvbuf1[eax+2048]*/\
                "psubw %%mm3, %%mm2                \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
                "psubw %%mm4, %%mm5                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
                "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t"\
                "pmulhw %%mm0, %%mm2                \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
                "pmulhw %%mm0, %%mm5                \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm0        \n\t" /*buf0[eax]*/\
                "movq (%1, "#index", 2), %%mm1        \n\t" /*buf1[eax]*/\
                "movq 8(%0, "#index", 2), %%mm6\n\t" /*buf0[eax]*/\
                "movq 8(%1, "#index", 2), %%mm7\n\t" /*buf1[eax]*/\
                "psubw %%mm1, %%mm0                \n\t" /* buf0[eax] - buf1[eax]*/\
                "psubw %%mm7, %%mm6                \n\t" /* buf0[eax] - buf1[eax]*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6\n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "paddw %%mm0, %%mm1                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "paddw %%mm6, %%mm7                \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
523
                
524
/* Load one source line for packed-YUV output, no vertical interpolation:
 * fetches 4 U samples (from %2) and 4 V samples (from %2+4096) plus 8 luma
 * samples (from %0), and scales each from the 15-bit intermediate format
 * down to 8 bits (>>7).  On exit: U in mm3, V in mm4, Y in mm1/mm7 —
 * the register layout expected by the WRITE* macros pasted after it
 * inside the same asm statement.  "#index" is the shared loop counter,
 * zeroed here; the "1:" label is the loop head the writers jump back to. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
                "xor "#index", "#index"                \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $7, %%mm3                \n\t" \
                "psraw $7, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
538
                
539
/* YUV -> RGB conversion for a single (non-interpolated) source line:
 * reads U/V from %2 / %2+4096 and luma from %0, applies the colorspace
 * coefficients stored in the context at "#c" (U_OFFSET/V_OFFSET/Y_OFFSET,
 * U*/V* COEFF), and produces packed 8-bit components:
 * B in mm2, G in mm4, R in mm5, mm7 zeroed — the input contract of the
 * WRITEBGR*/WRITEYUY2 macros.  Only >>4 (not >>7) is applied up front;
 * the remaining precision is consumed by the pmulhw fixed-point scaling.
 * Statement order is tuned for MMX register pressure — do not reorder. */
#define REAL_YSCALEYUV2RGB1(index, c) \
                "xor "#index", "#index"        \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm3        \n\t" /* uvbuf0[eax]*/\
                "movq 4096(%2, "#index"), %%mm4        \n\t" /* uvbuf0[eax+2048]*/\
                "psraw $4, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
                "psraw $4, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
587

    
588
/* Like REAL_YSCALEYUV2PACKED1, but averages the chroma of two source
 * lines (uvbuf0 at %2, uvbuf1 at %3): paddw then >>8 halves the sum and
 * simultaneously drops the 15->8 bit intermediate precision.  Luma still
 * comes from a single line (%0).  Output registers match the packed
 * writers: U in mm3, V in mm4, Y in mm1/mm7. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
                "xor "#index", "#index"                \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $8, %%mm3                \n\t" \
                "psrlw $8, %%mm4                \n\t" \
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $7, %%mm1                \n\t" \
                "psraw $7, %%mm7                \n\t" 
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
605
                
606
// do vertical chrominance interpolation
/* Like REAL_YSCALEYUV2RGB1, but chroma is the average of two source
 * lines (uvbuf0 at %2, uvbuf1 at %3): paddw followed by >>5 halves the
 * sum and reduces precision to the level the pmulhw stages expect.
 * The existing FIXME notes the unsigned shift can overflow/wrap for
 * extreme filter outputs.  Output contract is identical to the other
 * YSCALEYUV2RGB* variants: B=mm2, G=mm4, R=mm5, mm7=0. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
                "xor "#index", "#index"                \n\t"\
                ASMALIGN(4)\
                "1:                                \n\t"\
                "movq (%2, "#index"), %%mm2        \n\t" /* uvbuf0[eax]*/\
                "movq (%3, "#index"), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, "#index"), %%mm5        \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, "#index"), %%mm4        \n\t" /* uvbuf1[eax+2048]*/\
                "paddw %%mm2, %%mm3                \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
                "paddw %%mm5, %%mm4                \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
                "psrlw $5, %%mm3                \n\t" /*FIXME might overflow*/\
                "psrlw $5, %%mm4                \n\t" /*FIXME might overflow*/\
                "psubw "U_OFFSET"("#c"), %%mm3        \n\t" /* (U-128)8*/\
                "psubw "V_OFFSET"("#c"), %%mm4        \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2                \n\t" /* (U-128)8*/\
                "movq %%mm4, %%mm5                \n\t" /* (V-128)8*/\
                "pmulhw "UG_COEFF"("#c"), %%mm3\n\t"\
                "pmulhw "VG_COEFF"("#c"), %%mm4\n\t"\
        /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
                "movq (%0, "#index", 2), %%mm1        \n\t" /*buf0[eax]*/\
                "movq 8(%0, "#index", 2), %%mm7        \n\t" /*buf0[eax]*/\
                "psraw $4, %%mm1                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "psraw $4, %%mm7                \n\t" /* buf0[eax] - buf1[eax] >>4*/\
                "pmulhw "UB_COEFF"("#c"), %%mm2\n\t"\
                "pmulhw "VR_COEFF"("#c"), %%mm5\n\t"\
                "psubw "Y_OFFSET"("#c"), %%mm1        \n\t" /* 8(Y-16)*/\
                "psubw "Y_OFFSET"("#c"), %%mm7        \n\t" /* 8(Y-16)*/\
                "pmulhw "Y_COEFF"("#c"), %%mm1        \n\t"\
                "pmulhw "Y_COEFF"("#c"), %%mm7        \n\t"\
        /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
                "paddw %%mm3, %%mm4                \n\t"\
                "movq %%mm2, %%mm0                \n\t"\
                "movq %%mm5, %%mm6                \n\t"\
                "movq %%mm4, %%mm3                \n\t"\
                "punpcklwd %%mm2, %%mm2                \n\t"\
                "punpcklwd %%mm5, %%mm5                \n\t"\
                "punpcklwd %%mm4, %%mm4                \n\t"\
                "paddw %%mm1, %%mm2                \n\t"\
                "paddw %%mm1, %%mm5                \n\t"\
                "paddw %%mm1, %%mm4                \n\t"\
                "punpckhwd %%mm0, %%mm0                \n\t"\
                "punpckhwd %%mm6, %%mm6                \n\t"\
                "punpckhwd %%mm3, %%mm3                \n\t"\
                "paddw %%mm7, %%mm0                \n\t"\
                "paddw %%mm7, %%mm6                \n\t"\
                "paddw %%mm7, %%mm3                \n\t"\
                /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
                "packuswb %%mm0, %%mm2                \n\t"\
                "packuswb %%mm6, %%mm5                \n\t"\
                "packuswb %%mm3, %%mm4                \n\t"\
                "pxor %%mm7, %%mm7                \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
659

    
660
/* Store 8 pixels as 32-bit BGR(0): interleaves B (mm2), G (mm4), R (mm5)
 * with a zero high byte (mm7 must be 0) via byte/word unpacks, then writes
 * four quadwords with MOVNTQ (non-temporal on MMX2, plain movq otherwise).
 * Advances "#index" by 8 pixels and loops back to label "1:" while
 * index < dstw — this is the loop tail for the YSCALEYUV2* loop heads. */
#define REAL_WRITEBGR32(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        MOVNTQ(%%mm0, (dst, index, 4))\
                        MOVNTQ(%%mm2, 8(dst, index, 4))\
                        MOVNTQ(%%mm1, 16(dst, index, 4))\
                        MOVNTQ(%%mm3, 24(dst, index, 4))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
684

    
685
/* Store 8 pixels as RGB565: masks each component to its significant bits
 * (bF8 = 5 bits for B/R, bFC = 6 bits for G), shifts them into the
 * 5-6-5 field positions and ORs them together, then writes two quadwords.
 * Expects B=mm2, G=mm4, R=mm5, mm7=0 (from YSCALEYUV2RGB*); same
 * add/cmp/jb loop tail as the other writers. */
#define REAL_WRITEBGR16(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
                        "pand "MANGLE(bFC)", %%mm4        \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $3, %%mm3                \n\t"\
                        "psllq $3, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
712

    
713
/* Store 8 pixels as RGB555: same scheme as WRITEBGR16 but all three
 * components are masked to 5 bits (bF8) and R is pre-shifted right by 1
 * so the final fields land in the 5-5-5 layout (top bit unused).
 * Expects B=mm2, G=mm4, R=mm5, mm7=0; loops back to "1:" while
 * index < dstw. */
#define REAL_WRITEBGR15(dst, dstw, index) \
                        "pand "MANGLE(bF8)", %%mm2        \n\t" /* B */\
                        "pand "MANGLE(bF8)", %%mm4        \n\t" /* G */\
                        "pand "MANGLE(bF8)", %%mm5        \n\t" /* R */\
                        "psrlq $3, %%mm2                \n\t"\
                        "psrlq $1, %%mm5                \n\t"\
\
                        "movq %%mm2, %%mm1                \n\t"\
                        "movq %%mm4, %%mm3                \n\t"\
\
                        "punpcklbw %%mm7, %%mm3                \n\t"\
                        "punpcklbw %%mm5, %%mm2                \n\t"\
                        "punpckhbw %%mm7, %%mm4                \n\t"\
                        "punpckhbw %%mm5, %%mm1                \n\t"\
\
                        "psllq $2, %%mm3                \n\t"\
                        "psllq $2, %%mm4                \n\t"\
\
                        "por %%mm3, %%mm2                \n\t"\
                        "por %%mm4, %%mm1                \n\t"\
\
                        MOVNTQ(%%mm2, (dst, index, 2))\
                        MOVNTQ(%%mm1, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
741

    
742
/* Legacy 24-bit packed-BGR writer, superseded by WRITEBGR24MMX/MMX2
 * below (kept for reference; not selected by the WRITEBGR24 dispatch).
 * Builds four 0RGB quadwords, then squeezes out the padding byte of
 * each pixel with shift/mask/or sequences to emit 3 contiguous
 * quadwords (= 8 pixels * 3 bytes).  Note this variant advances "#dst"
 * by 24 each iteration instead of addressing with (dst,index). */
#define WRITEBGR24OLD(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "psrlq $8, %%mm0                \n\t" /* 00RGB0RG 0 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 0 */\
                        "pand "MANGLE(bm11111000)", %%mm0\n\t" /* 00RGB000 0.5 */\
                        "por %%mm4, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm4                \n\t" /* 0RGB0RGB 1 */\
                        "psllq $48, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
\
                        "movq %%mm4, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "psrld $16, %%mm4                \n\t" /* 000R000R 1 */\
                        "psrlq $24, %%mm2                \n\t" /* 0000RGB0 1.5 */\
                        "por %%mm4, %%mm2                \n\t" /* 000RRGBR 1 */\
                        "pand "MANGLE(bm00001111)", %%mm2\n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm4                \n\t" /* 0RGB0RGB 2 */\
                        "psrlq $8, %%mm1                \n\t" /* 00RGB0RG 2 */\
                        "pand "MANGLE(bm00000111)", %%mm4\n\t" /* 00000RGB 2 */\
                        "pand "MANGLE(bm11111000)", %%mm1\n\t" /* 00RGB000 2.5 */\
                        "por %%mm4, %%mm1                \n\t" /* 00RGBRGB 2 */\
                        "movq %%mm1, %%mm4                \n\t" /* 00RGBRGB 2 */\
                        "psllq $32, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm2                \n\t" /* BRGBRGBR 1 */\
\
                        "psrlq $32, %%mm4                \n\t" /* 000000RG 2.5 */\
                        "movq %%mm3, %%mm5                \n\t" /* 0RGB0RGB 3 */\
                        "psrlq $8, %%mm3                \n\t" /* 00RGB0RG 3 */\
                        "pand "MANGLE(bm00000111)", %%mm5\n\t" /* 00000RGB 3 */\
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" /* 00RGB000 3.5 */\
                        "por %%mm5, %%mm3                \n\t" /* 00RGBRGB 3 */\
                        "psllq $16, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm4, %%mm3                \n\t" /* RGBRGBRG 2.5 */\
\
                        MOVNTQ(%%mm0, (dst))\
                        MOVNTQ(%%mm2, 8(dst))\
                        MOVNTQ(%%mm3, 16(dst))\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
797

    
798
/* 24-bit packed-BGR writer using only baseline MMX instructions:
 * first builds four 0RGB quadwords (as in WRITEBGR32), then uses
 * psllq/punpckhdq to fold each pixel's padding byte away and stitches
 * neighbouring pixels together into 3 output quadwords (8 px * 3 B).
 * Expects B=mm2, G=mm4, R=mm5, mm7=0; advances "#dst" by 24 and
 * "#index" by 8 per iteration, looping back to "1:". */
#define WRITEBGR24MMX(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq %%mm2, %%mm1                \n\t" /* B */\
                        "movq %%mm5, %%mm6                \n\t" /* R */\
                        "punpcklbw %%mm4, %%mm2                \n\t" /* GBGBGBGB 0 */\
                        "punpcklbw %%mm7, %%mm5                \n\t" /* 0R0R0R0R 0 */\
                        "punpckhbw %%mm4, %%mm1                \n\t" /* GBGBGBGB 2 */\
                        "punpckhbw %%mm7, %%mm6                \n\t" /* 0R0R0R0R 2 */\
                        "movq %%mm2, %%mm0                \n\t" /* GBGBGBGB 0 */\
                        "movq %%mm1, %%mm3                \n\t" /* GBGBGBGB 2 */\
                        "punpcklwd %%mm5, %%mm0                \n\t" /* 0RGB0RGB 0 */\
                        "punpckhwd %%mm5, %%mm2                \n\t" /* 0RGB0RGB 1 */\
                        "punpcklwd %%mm6, %%mm1                \n\t" /* 0RGB0RGB 2 */\
                        "punpckhwd %%mm6, %%mm3                \n\t" /* 0RGB0RGB 3 */\
\
                        "movq %%mm0, %%mm4                \n\t" /* 0RGB0RGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGB0RGB 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGB0RGB 2 */\
                        "movq %%mm3, %%mm7                \n\t" /* 0RGB0RGB 3 */\
\
                        "psllq $40, %%mm0                \n\t" /* RGB00000 0 */\
                        "psllq $40, %%mm2                \n\t" /* RGB00000 1 */\
                        "psllq $40, %%mm1                \n\t" /* RGB00000 2 */\
                        "psllq $40, %%mm3                \n\t" /* RGB00000 3 */\
\
                        "punpckhdq %%mm4, %%mm0                \n\t" /* 0RGBRGB0 0 */\
                        "punpckhdq %%mm6, %%mm2                \n\t" /* 0RGBRGB0 1 */\
                        "punpckhdq %%mm5, %%mm1                \n\t" /* 0RGBRGB0 2 */\
                        "punpckhdq %%mm7, %%mm3                \n\t" /* 0RGBRGB0 3 */\
\
                        "psrlq $8, %%mm0                \n\t" /* 00RGBRGB 0 */\
                        "movq %%mm2, %%mm6                \n\t" /* 0RGBRGB0 1 */\
                        "psllq $40, %%mm2                \n\t" /* GB000000 1 */\
                        "por %%mm2, %%mm0                \n\t" /* GBRGBRGB 0 */\
                        MOVNTQ(%%mm0, (dst))\
\
                        "psrlq $24, %%mm6                \n\t" /* 0000RGBR 1 */\
                        "movq %%mm1, %%mm5                \n\t" /* 0RGBRGB0 2 */\
                        "psllq $24, %%mm1                \n\t" /* BRGB0000 2 */\
                        "por %%mm1, %%mm6                \n\t" /* BRGBRGBR 1 */\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "psrlq $40, %%mm5                \n\t" /* 000000RG 2 */\
                        "psllq $8, %%mm3                \n\t" /* RGBRGB00 3 */\
                        "por %%mm3, %%mm5                \n\t" /* RGBRGBRG 2 */\
                        MOVNTQ(%%mm5, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                        \n\t"\
                        "cmp "#dstw", "#index"                        \n\t"\
                        " jb 1b                                \n\t"
850

    
851
/* 24-bit packed-BGR writer for MMX2 targets: exploits pshufw (MMX2-only)
 * to replicate component pairs directly into their output byte lanes,
 * selecting them with the M24A/M24B/M24C byte masks, which needs far
 * fewer shift/or steps than the plain-MMX variant.  Expects B=mm2,
 * G=mm4, R=mm5 (mm7 is clobbered as a mask register here); advances
 * "#dst" by 24 and "#index" by 8, looping back to "1:". */
#define WRITEBGR24MMX2(dst, dstw, index) \
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
                        "movq "MANGLE(M24A)", %%mm0        \n\t"\
                        "movq "MANGLE(M24C)", %%mm7        \n\t"\
                        "pshufw $0x50, %%mm2, %%mm1        \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
                        "pshufw $0x50, %%mm4, %%mm3        \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
                        "pshufw $0x00, %%mm5, %%mm6        \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
                        "pand %%mm0, %%mm1                \n\t" /*    B2        B1       B0 */\
                        "pand %%mm0, %%mm3                \n\t" /*    G2        G1       G0 */\
                        "pand %%mm7, %%mm6                \n\t" /*       R1        R0       */\
\
                        "psllq $8, %%mm3                \n\t" /* G2        G1       G0    */\
                        "por %%mm1, %%mm6                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, (dst))\
\
                        "psrlq $8, %%mm4                \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
                        "pshufw $0xA5, %%mm2, %%mm1        \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
                        "pshufw $0x55, %%mm4, %%mm3        \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
                        "pshufw $0xA5, %%mm5, %%mm6        \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
                        "pand "MANGLE(M24B)", %%mm1        \n\t" /* B5       B4        B3    */\
                        "pand %%mm7, %%mm3                \n\t" /*       G4        G3       */\
                        "pand %%mm0, %%mm6                \n\t" /*    R4        R3       R2 */\
\
                        "por %%mm1, %%mm3                \n\t" /* B5    G4 B4     G3 B3    */\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 8(dst))\
\
                        "pshufw $0xFF, %%mm2, %%mm1        \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
                        "pshufw $0xFA, %%mm4, %%mm3        \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
                        "pshufw $0xFA, %%mm5, %%mm6        \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
                        "pand %%mm7, %%mm1                \n\t" /*       B7        B6       */\
                        "pand %%mm0, %%mm3                \n\t" /*    G7        G6       G5 */\
                        "pand "MANGLE(M24B)", %%mm6        \n\t" /* R7       R6        R5    */\
\
                        "por %%mm1, %%mm3                \n\t"\
                        "por %%mm3, %%mm6                \n\t"\
                        MOVNTQ(%%mm6, 16(dst))\
\
                        "add $24, "#dst"                \n\t"\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
898

    
899
/* Select the 24-bit writer for the compile target: the MMX2 variant
 * relies on pshufw, which baseline MMX does not have. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
906

    
907
/* Store 8 pixels as packed YUY2 (Y U Y V ...): saturates U (mm3),
 * V (mm4) and Y (mm1/mm7) to bytes, interleaves U with V and then the
 * luma with the chroma pair, and writes two quadwords.  Register inputs
 * come from the YSCALEYUV2PACKED* loaders; same add/cmp/jb loop tail. */
#define REAL_WRITEYUY2(dst, dstw, index) \
                        "packuswb %%mm3, %%mm3                \n\t"\
                        "packuswb %%mm4, %%mm4                \n\t"\
                        "packuswb %%mm7, %%mm1                \n\t"\
                        "punpcklbw %%mm4, %%mm3                \n\t"\
                        "movq %%mm1, %%mm7                \n\t"\
                        "punpcklbw %%mm3, %%mm1                \n\t"\
                        "punpckhbw %%mm3, %%mm7                \n\t"\
\
                        MOVNTQ(%%mm1, (dst, index, 2))\
                        MOVNTQ(%%mm7, 8(dst, index, 2))\
\
                        "add $8, "#index"                \n\t"\
                        "cmp "#dstw", "#index"                \n\t"\
                        " jb 1b                                \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
923

    
924

    
925
/**
 * Vertically scale/filter planar YV12: applies the vertical FIR filters
 * (lumFilter x lumSrc, chrFilter x chrSrc) and writes 8-bit planes.
 *
 * @param dest/uDest/vDest  output Y/U/V planes; uDest==NULL skips chroma
 *                          (vDest is only used when uDest is set)
 * @param dstW/chrDstW      output widths for the luma / chroma planes
 * Chroma sources store U at chrSrc[i] and V at chrSrc[i]+2048 words,
 * hence the 4096-byte offset passed to the MMX macros.
 * Dispatches to MMX asm (exact rounding when SWS_ACCURATE_RND is set),
 * AltiVec, or the generic C implementation, per compile-time flags.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
        if(c->flags & SWS_ACCURATE_RND){
                if(uDest){
                        YSCALEYUV2YV12X_ACCURATE(   0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                        YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
                }

                YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
                if(uDest){
                        YSCALEYUV2YV12X(   0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                        YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
                }

                YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
957

    
958
/**
 * Vertically scale/filter to NV12/NV21 (interleaved chroma).  No SIMD
 * implementation exists for this path, so every build simply forwards
 * to the generic C routine; the SwsContext parameter is unused here.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
        yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                     chrFilter, chrSrc, chrFilterSize,
                     dest, uDest, dstW, chrDstW, dstFormat);
}
966

    
967
/**
 * Unscaled vertical pass (1-tap "filter"): converts one 15-bit
 * intermediate line per plane straight to 8 bits (>>7) with clipping.
 *
 * @param lumSrc  luma line, dstW int16 samples
 * @param chrSrc  chroma line; U at chrSrc[i], V at chrSrc[i+2048]
 * @param uDest   NULL to skip chroma output (vDest only used when set)
 * On MMX the YSCALEYUV2YV121 asm loop runs backwards from the buffer
 * end using a negative count, hence the src+width / -width operands.
 */
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
        if(uDest != NULL)
        {
                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
                                "g" (-chrDstW)
                                : "%"REG_a
                        );

                /* V plane: same loop, source offset by 2048 int16 samples */
                asm volatile(
                                YSCALEYUV2YV121
                                :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
                                "g" (-chrDstW)
                                : "%"REG_a
                        );
        }

        asm volatile(
                YSCALEYUV2YV121
                :: "r" (lumSrc + dstW), "r" (dest + dstW),
                "g" (-dstW)
                : "%"REG_a
        );
#else
        int i;
        for(i=0; i<dstW; i++)
        {
                int val= lumSrc[i]>>7;
                
                /* int16>>7 is in [-256,255], so bit 8 set means either
                   negative (clip to 0) or, after the sign test, 256..
                   overflow (clip to 255) -- cheap branch-avoiding clip */
                if(val&256){
                        if(val<0) val=0;
                        else      val=255;
                }

                dest[i]= val;
        }

        if(uDest != NULL)
                for(i=0; i<chrDstW; i++)
                {
                        int u=chrSrc[i]>>7;
                        int v=chrSrc[i + 2048]>>7;

                        /* same [-256,255] range argument as for luma */
                        if((u|v)&256){
                                if(u<0)         u=0;
                                else if (u>255) u=255;
                                if(v<0)         v=0;
                                else if (v>255) v=255;
                        }

                        uDest[i]= u;
                        vDest[i]= v;
                }
#endif
}
1026

    
1027

    
1028
/**
1029
 * vertical scale YV12 to RGB
1030
 */
1031
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                            uint8_t *dest, long dstW, long dstY)
{
        long dummy=0;
#ifdef HAVE_MMX
    /* MMX fast path: pick the bit-exact (SWS_ACCURATE_RND) or the fast
       rounding variant of the vertical-scale + YUV->RGB macros, then the
       per-format store macro. Unhandled dstFormat values fall through to
       the AltiVec/C code below. */
    if(c->flags & SWS_ACCURATE_RND){
                switch(c->dstFormat){
                case IMGFMT_BGR32:
                                YSCALEYUV2PACKEDX_ACCURATE
                                YSCALEYUV2RGBX
                                WRITEBGR32(%4, %5, %%REGa)

                                YSCALEYUV2PACKEDX_END
                        return;
                case IMGFMT_BGR24:
                                YSCALEYUV2PACKEDX_ACCURATE
                                YSCALEYUV2RGBX
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
                                "add %4, %%"REG_b"                        \n\t"
                                WRITEBGR24(%%REGb, %5, %%REGa)

                        /* BGR24 needs an extra register, so the asm is closed
                           by hand here instead of via YSCALEYUV2PACKEDX_END */
                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
                        );
                        return;
                case IMGFMT_BGR15:
                                YSCALEYUV2PACKEDX_ACCURATE
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR15(%4, %5, %%REGa)
                                YSCALEYUV2PACKEDX_END
                        return;
                case IMGFMT_BGR16:
                                YSCALEYUV2PACKEDX_ACCURATE
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                /* green uses the 6-bit dither table here (5-6-5) */
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%4, %5, %%REGa)
                                YSCALEYUV2PACKEDX_END
                        return;
                case IMGFMT_YUY2:
                                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                                "psraw $3, %%mm3                \n\t"
                                "psraw $3, %%mm4                \n\t"
                                "psraw $3, %%mm1                \n\t"
                                "psraw $3, %%mm7                \n\t"
                                WRITEYUY2(%4, %5, %%REGa)
                                YSCALEYUV2PACKEDX_END
                        return;
                }
    }else{
        switch(c->dstFormat)
        {
        case IMGFMT_BGR32:
                                YSCALEYUV2PACKEDX
                                YSCALEYUV2RGBX
                                WRITEBGR32(%4, %5, %%REGa)
                                YSCALEYUV2PACKEDX_END
                return;
        case IMGFMT_BGR24:
                                YSCALEYUV2PACKEDX
                                YSCALEYUV2RGBX
                                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t" //FIXME optimize
                                "add %4, %%"REG_b"                        \n\t"
                                WRITEBGR24(%%REGb, %5, %%REGa)

                        :: "r" (&c->redDither),
                           "m" (dummy), "m" (dummy), "m" (dummy),
                           "r" (dest), "m" (dstW)
                        : "%"REG_a, "%"REG_b, "%"REG_d, "%"REG_S //FIXME ebx
                        );
                return;
        case IMGFMT_BGR15:
                                YSCALEYUV2PACKEDX
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR15(%4, %5, %%REGa)
                                YSCALEYUV2PACKEDX_END
                return;
        case IMGFMT_BGR16:
                                YSCALEYUV2PACKEDX
                                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%4, %5, %%REGa)
                                YSCALEYUV2PACKEDX_END
                return;
        case IMGFMT_YUY2:
                                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                                "psraw $3, %%mm3                \n\t"
                                "psraw $3, %%mm4                \n\t"
                                "psraw $3, %%mm1                \n\t"
                                "psraw $3, %%mm7                \n\t"
                                WRITEYUY2(%4, %5, %%REGa)
                                YSCALEYUV2PACKEDX_END
                return;
        }
    }
#endif
#ifdef HAVE_ALTIVEC
                /* The following list of supported dstFormat values should
                   match what's found in the body of altivec_yuv2packedX() */
                if(c->dstFormat==IMGFMT_ABGR  || c->dstFormat==IMGFMT_BGRA  ||
                   c->dstFormat==IMGFMT_BGR24 || c->dstFormat==IMGFMT_RGB24 ||
                   c->dstFormat==IMGFMT_RGBA  || c->dstFormat==IMGFMT_ARGB)
                        altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                                    chrFilter, chrSrc, chrFilterSize,
                                    dest, dstW, dstY);
                else
#endif
                        /* portable C fallback for every format not handled above */
                        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                                    chrFilter, chrSrc, chrFilterSize,
                                    dest, dstW, dstY);
}
1175

    
1176
/**
 * vertical bilinear scale YV12 to RGB
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
        /* blend weights: yalpha/uvalpha select buf1/uvbuf1, the ^4095
           complements select buf0/uvbuf0 (12-bit fixed point) */
        int yalpha1=yalpha^4095;
        int uvalpha1=uvalpha^4095;
        int i;

#if 0 //isn't used
        if(flags&SWS_FULL_CHR_H_INT)
        {
                switch(dstFormat)
                {
#ifdef HAVE_MMX
                case IMGFMT_BGR32:
                        asm volatile(

FULL_YSCALEYUV2RGB
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0

                        "movq %%mm3, %%mm1                \n\t"
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0

                        MOVNTQ(%%mm3, (%4, %%REGa, 4))
                        MOVNTQ(%%mm1, 8(%4, %%REGa, 4))

                        "add $4, %%"REG_a"                \n\t"
                        "cmp %5, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%"REG_a
                        );
                        break;
                case IMGFMT_BGR24:
                        asm volatile(

FULL_YSCALEYUV2RGB

                                                                // lsb ... msb
                        "punpcklbw %%mm1, %%mm3                \n\t" // BGBGBGBG
                        "punpcklbw %%mm7, %%mm0                \n\t" // R0R0R0R0

                        "movq %%mm3, %%mm1                \n\t"
                        "punpcklwd %%mm0, %%mm3                \n\t" // BGR0BGR0
                        "punpckhwd %%mm0, %%mm1                \n\t" // BGR0BGR0

                        "movq %%mm3, %%mm2                \n\t" // BGR0BGR0
                        "psrlq $8, %%mm3                \n\t" // GR0BGR00
                        "pand "MANGLE(bm00000111)", %%mm2\n\t" // BGR00000
                        "pand "MANGLE(bm11111000)", %%mm3\n\t" // 000BGR00
                        "por %%mm2, %%mm3                \n\t" // BGRBGR00
                        "movq %%mm1, %%mm2                \n\t"
                        "psllq $48, %%mm1                \n\t" // 000000BG
                        "por %%mm1, %%mm3                \n\t" // BGRBGRBG

                        "movq %%mm2, %%mm1                \n\t" // BGR0BGR0
                        "psrld $16, %%mm2                \n\t" // R000R000
                        "psrlq $24, %%mm1                \n\t" // 0BGR0000
                        "por %%mm2, %%mm1                \n\t" // RBGRR000

                        "mov %4, %%"REG_b"                \n\t"
                        "add %%"REG_a", %%"REG_b"        \n\t"

#ifdef HAVE_MMX2
                        //FIXME Alignment
                        "movntq %%mm3, (%%"REG_b", %%"REG_a", 2)\n\t"
                        "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)\n\t"
#else
                        "movd %%mm3, (%%"REG_b", %%"REG_a", 2)        \n\t"
                        "psrlq $32, %%mm3                \n\t"
                        "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)        \n\t"
                        "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)        \n\t"
#endif
                        "add $4, %%"REG_a"                \n\t"
                        "cmp %5, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%"REG_a, "%"REG_b
                        );
                        break;
                case IMGFMT_BGR15:
                        asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                        "paddusb "MANGLE(g5Dither)", %%mm1\n\t"
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R

                        "psrlw $3, %%mm3                \n\t"
                        "psllw $2, %%mm1                \n\t"
                        "psllw $7, %%mm0                \n\t"
                        "pand "MANGLE(g15Mask)", %%mm1        \n\t"
                        "pand "MANGLE(r15Mask)", %%mm0        \n\t"

                        "por %%mm3, %%mm1                \n\t"
                        "por %%mm1, %%mm0                \n\t"

                        MOVNTQ(%%mm0, (%4, %%REGa, 2))

                        "add $4, %%"REG_a"                \n\t"
                        "cmp %5, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%"REG_a
                        );
                        break;
                case IMGFMT_BGR16:
                        asm volatile(

FULL_YSCALEYUV2RGB
#ifdef DITHER1XBPP
                        "paddusb "MANGLE(g6Dither)", %%mm1\n\t"
                        "paddusb "MANGLE(r5Dither)", %%mm0\n\t"
                        "paddusb "MANGLE(b5Dither)", %%mm3\n\t"
#endif
                        "punpcklbw %%mm7, %%mm1                \n\t" // 0G0G0G0G
                        "punpcklbw %%mm7, %%mm3                \n\t" // 0B0B0B0B
                        "punpcklbw %%mm7, %%mm0                \n\t" // 0R0R0R0R

                        "psrlw $3, %%mm3                \n\t"
                        "psllw $3, %%mm1                \n\t"
                        "psllw $8, %%mm0                \n\t"
                        "pand "MANGLE(g16Mask)", %%mm1        \n\t"
                        "pand "MANGLE(r16Mask)", %%mm0        \n\t"

                        "por %%mm3, %%mm1                \n\t"
                        "por %%mm1, %%mm0                \n\t"

                        MOVNTQ(%%mm0, (%4, %%REGa, 2))

                        "add $4, %%"REG_a"                \n\t"
                        "cmp %5, %%"REG_a"                \n\t"
                        " jb 1b                                \n\t"

                        :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
                        "m" (yalpha1), "m" (uvalpha1)
                        : "%"REG_a
                        );
                break;
#endif
                case IMGFMT_RGB32:
#ifndef HAVE_MMX
                case IMGFMT_BGR32:
#endif
                if(dstFormat==IMGFMT_BGR32)
                {
                        int i;
#ifdef WORDS_BIGENDIAN
                        dest++;
#endif
                        for(i=0;i<dstW;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                                dest+= 4;
                        }
                }
                else if(dstFormat==IMGFMT_BGR24)
                {
                        int i;
                        for(i=0;i<dstW;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
                                dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
                                dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
                                dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
                                dest+= 3;
                        }
                }
                else if(dstFormat==IMGFMT_BGR16)
                {
                        int i;
                        for(i=0;i<dstW;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[i] =
                                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
                                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
                        }
                }
                else if(dstFormat==IMGFMT_BGR15)
                {
                        int i;
                        for(i=0;i<dstW;i++){
                                // vertical linear interpolation && yuv2rgb in a single step:
                                int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
                                int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
                                int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);

                                ((uint16_t*)dest)[i] =
                                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
                                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
                                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
                        }
                }
        }//FULL_UV_IPOL
        else
        {
#endif // if 0
#ifdef HAVE_MMX
        /* MMX path: %ebx/%ebp are needed by the macros, so they are saved
           around the asm (ESP_OFFSET scratch slot inside *c). */
        switch(c->dstFormat)
        {
//Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case IMGFMT_BGR32:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB(%%REGBP, %5)
                                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
        case IMGFMT_BGR24:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB(%%REGBP, %5)
                                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
        case IMGFMT_BGR15:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
        case IMGFMT_BGR16:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
        case IMGFMT_YUY2:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2PACKED(%%REGBP, %5)
                                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
        default: break;
        }
#endif //HAVE_MMX
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
}
1492

    
1493
/**
 * YV12 to RGB without scaling or interpolating
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                            uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
        const int yalpha1=0;
        int i;

        uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
        const int yalpha= 4096; //FIXME ...

        /* full horizontal chroma interpolation is not implemented here;
           fall back to the bilinear two-line routine with a zero luma blend */
        if(flags&SWS_FULL_CHR_H_INT)
        {
                RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
                return;
        }

#ifdef HAVE_MMX
        if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
        {
                /* nearest chroma line: YSCALEYUV2RGB1 reads uvbuf0 only */
                switch(dstFormat)
                {
                case IMGFMT_BGR32:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB1(%%REGBP, %5)
                                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
                case IMGFMT_BGR24:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB1(%%REGBP, %5)
                                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
                case IMGFMT_BGR15:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB1(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
                case IMGFMT_BGR16:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB1(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
                case IMGFMT_YUY2:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2PACKED1(%%REGBP, %5)
                                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
                }
        }
        else
        {
                /* averaged chroma lines: the *1b macros blend uvbuf0/uvbuf1 */
                switch(dstFormat)
                {
                case IMGFMT_BGR32:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB1b(%%REGBP, %5)
                                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
                case IMGFMT_BGR24:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB1b(%%REGBP, %5)
                                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
                case IMGFMT_BGR15:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB1b(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif
                                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
                case IMGFMT_BGR16:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2RGB1b(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
                case IMGFMT_YUY2:
                        asm volatile(
                                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                                "mov %4, %%"REG_b"                        \n\t"
                                "push %%"REG_BP"                        \n\t"
                                YSCALEYUV2PACKED1b(%%REGBP, %5)
                                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                                "pop %%"REG_BP"                         \n\t"
                                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                        );
                        return;
                }
        }
#endif
        /* C fallback, mirroring the nearest/averaged chroma split above */
        if( uvalpha < 2048 )
        {
                YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
        }else{
                YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
        }
}
1698

    
1699
//FIXME yuy2* can read up to 7 samples too much
1700

    
1701
/* Extract the luma (Y) plane from packed YUY2 (Y0 U Y1 V) input.
 * width is in luma samples; src holds 2 bytes per sample. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
        /* Processes 8 Y samples per iteration. The pointers are biased to the
         * end of the buffers and REG_a counts a negative offset up to 0, so a
         * single "add/js" pair forms the loop. bm01010101 masks out the chroma
         * bytes, packuswb merges the two masked quadwords into 8 luma bytes. */
        asm volatile(
                "movq "MANGLE(bm01010101)", %%mm2\n\t"
                "mov %0, %%"REG_a"                \n\t"
                "1:                                \n\t"
                "movq (%1, %%"REG_a",2), %%mm0        \n\t"
                "movq 8(%1, %%"REG_a",2), %%mm1        \n\t"
                "pand %%mm2, %%mm0                \n\t"
                "pand %%mm2, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0                \n\t"
                "movq %%mm0, (%2, %%"REG_a")        \n\t"
                "add $8, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "g" (-width), "r" (src+width*2), "r" (dst+width)
                : "%"REG_a
        );
#else
        /* Scalar fallback: Y is every even byte of the YUY2 stream. */
        int i;
        for(i=0; i<width; i++)
                dst[i]= src[2*i];
#endif
}
1725

    
1726
/* Extract and vertically average the chroma (U/V) planes from two YUY2 lines.
 * width is in chroma samples; each sample spans 4 source bytes (Y U Y V). */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        /* 4 chroma samples per iteration; PAVGB averages the two input lines,
         * then shifts/masks separate the interleaved U and V bytes.
         * Loop uses the usual end-biased pointers + negative counter. */
        asm volatile(
                "movq "MANGLE(bm01010101)", %%mm4\n\t"
                "mov %0, %%"REG_a"                \n\t"
                "1:                                \n\t"
                "movq (%1, %%"REG_a",4), %%mm0        \n\t"
                "movq 8(%1, %%"REG_a",4), %%mm1        \n\t"
                "movq (%2, %%"REG_a",4), %%mm2        \n\t"
                "movq 8(%2, %%"REG_a",4), %%mm3        \n\t"
                PAVGB(%%mm2, %%mm0)
                PAVGB(%%mm3, %%mm1)
                "psrlw $8, %%mm0                \n\t"
                "psrlw $8, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0                \n\t"
                "movq %%mm0, %%mm1                \n\t"
                "psrlw $8, %%mm0                \n\t"
                "pand %%mm4, %%mm1                \n\t"
                "packuswb %%mm0, %%mm0                \n\t"
                "packuswb %%mm1, %%mm1                \n\t"
                "movd %%mm0, (%4, %%"REG_a")        \n\t"
                "movd %%mm1, (%3, %%"REG_a")        \n\t"
                "add $4, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
                : "%"REG_a
        );
#else
        /* Scalar fallback: U at byte 1, V at byte 3 of each 4-byte group;
         * >>1 averages the two lines (truncating, unlike PAVGB's rounding). */
        int i;
        for(i=0; i<width; i++)
        {
                dstU[i]= (src1[4*i + 1] + src2[4*i + 1])>>1;
                dstV[i]= (src1[4*i + 3] + src2[4*i + 3])>>1;
        }
#endif
}
1763

    
1764
//this is almost identical to the previous function, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1765
/* Extract the luma (Y) plane from packed UYVY (U Y0 V Y1) input.
 * Same structure as yuy2ToY, but Y sits in the odd bytes, so the asm
 * uses psrlw $8 instead of a mask. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
        /* 8 Y samples per iteration; end-biased pointers, negative counter. */
        asm volatile(
                "mov %0, %%"REG_a"                \n\t"
                "1:                                \n\t"
                "movq (%1, %%"REG_a",2), %%mm0        \n\t"
                "movq 8(%1, %%"REG_a",2), %%mm1        \n\t"
                "psrlw $8, %%mm0                \n\t"
                "psrlw $8, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0                \n\t"
                "movq %%mm0, (%2, %%"REG_a")        \n\t"
                "add $8, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "g" (-width), "r" (src+width*2), "r" (dst+width)
                : "%"REG_a
        );
#else
        /* Scalar fallback: Y is every odd byte of the UYVY stream. */
        int i;
        for(i=0; i<width; i++)
                dst[i]= src[2*i+1];
#endif
}
1788

    
1789
/* Extract and vertically average the chroma (U/V) planes from two UYVY lines.
 * Mirror of yuy2ToUV with U/V in the even bytes (masked via pand rather
 * than shifted). width is in chroma samples (4 source bytes each). */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        /* 4 chroma samples per iteration; PAVGB averages the two lines. */
        asm volatile(
                "movq "MANGLE(bm01010101)", %%mm4\n\t"
                "mov %0, %%"REG_a"                \n\t"
                "1:                                \n\t"
                "movq (%1, %%"REG_a",4), %%mm0        \n\t"
                "movq 8(%1, %%"REG_a",4), %%mm1        \n\t"
                "movq (%2, %%"REG_a",4), %%mm2        \n\t"
                "movq 8(%2, %%"REG_a",4), %%mm3        \n\t"
                PAVGB(%%mm2, %%mm0)
                PAVGB(%%mm3, %%mm1)
                "pand %%mm4, %%mm0                \n\t"
                "pand %%mm4, %%mm1                \n\t"
                "packuswb %%mm1, %%mm0                \n\t"
                "movq %%mm0, %%mm1                \n\t"
                "psrlw $8, %%mm0                \n\t"
                "pand %%mm4, %%mm1                \n\t"
                "packuswb %%mm0, %%mm0                \n\t"
                "packuswb %%mm1, %%mm1                \n\t"
                "movd %%mm0, (%4, %%"REG_a")        \n\t"
                "movd %%mm1, (%3, %%"REG_a")        \n\t"
                "add $4, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "g" (-width), "r" (src1+width*4), "r" (src2+width*4), "r" (dstU+width), "r" (dstV+width)
                : "%"REG_a
        );
#else
        /* Scalar fallback: U at byte 0, V at byte 2 of each 4-byte group;
         * >>1 averages the two lines (truncating). */
        int i;
        for(i=0; i<width; i++)
        {
                dstU[i]= (src1[4*i + 0] + src2[4*i + 0])>>1;
                dstV[i]= (src1[4*i + 2] + src2[4*i + 2])>>1;
        }
#endif
}
1826

    
1827
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1828
{
1829
        int i;
1830
        for(i=0; i<width; i++)
1831
        {
1832
                int b=  ((uint32_t*)src)[i]&0xFF;
1833
                int g= (((uint32_t*)src)[i]>>8)&0xFF;
1834
                int r= (((uint32_t*)src)[i]>>16)&0xFF;
1835

    
1836
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1837
        }
1838
}
1839

    
1840
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1841
{
1842
        int i;
1843
        for(i=0; i<width; i++)
1844
        {
1845
                const int a= ((uint32_t*)src1)[2*i+0];
1846
                const int e= ((uint32_t*)src1)[2*i+1];
1847
                const int c= ((uint32_t*)src2)[2*i+0];
1848
                const int d= ((uint32_t*)src2)[2*i+1];
1849
                const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
1850
                const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
1851
                 const int b=  l&0x3FF;
1852
                const int g=  h>>8;
1853
                const int r=  l>>16;
1854

    
1855
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
1856
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
1857
        }
1858
}
1859

    
1860
/* Convert a row of packed 24-bit BGR pixels to 8-bit luma. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
        /* 8 output pixels (24 source bytes) per iteration. REG_a holds the
         * negative output offset, REG_b = 3*offset indexes the 3-byte-per-
         * pixel source. Each pixel's B,G,R (+1 overlap byte) is widened to
         * words and dotted with bgr2YCoeff via pmaddwd; w1111 then sums the
         * two halves of each pixel, and bgr2YOffset adds the luma offset.
         * NOTE(review): REG_b is clobbered, which conflicts with the PIC
         * register on 32-bit shared builds — longstanding limitation here. */
        asm volatile(
                "mov %2, %%"REG_a"                \n\t"
                "movq "MANGLE(bgr2YCoeff)", %%mm6                \n\t"
                "movq "MANGLE(w1111)", %%mm5                \n\t"
                "pxor %%mm7, %%mm7                \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
                ASMALIGN(4)
                "1:                                \n\t"
                PREFETCH" 64(%0, %%"REG_b")        \n\t"
                "movd (%0, %%"REG_b"), %%mm0        \n\t"
                "movd 3(%0, %%"REG_b"), %%mm1        \n\t"
                "punpcklbw %%mm7, %%mm0                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "movd 6(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 9(%0, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "pmaddwd %%mm6, %%mm0                \n\t"
                "pmaddwd %%mm6, %%mm1                \n\t"
                "pmaddwd %%mm6, %%mm2                \n\t"
                "pmaddwd %%mm6, %%mm3                \n\t"
#ifndef FAST_BGR2YV12
                /* extra precision path: drop 8 bits before repacking */
                "psrad $8, %%mm0                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                "packssdw %%mm1, %%mm0                \n\t"
                "packssdw %%mm3, %%mm2                \n\t"
                "pmaddwd %%mm5, %%mm0                \n\t"
                "pmaddwd %%mm5, %%mm2                \n\t"
                "packssdw %%mm2, %%mm0                \n\t"
                "psraw $7, %%mm0                \n\t"

                /* second group of 4 pixels (bytes 12..23) */
                "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
                "movd 15(%0, %%"REG_b"), %%mm1        \n\t"
                "punpcklbw %%mm7, %%mm4                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "movd 18(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 21(%0, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "pmaddwd %%mm6, %%mm4                \n\t"
                "pmaddwd %%mm6, %%mm1                \n\t"
                "pmaddwd %%mm6, %%mm2                \n\t"
                "pmaddwd %%mm6, %%mm3                \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                "packssdw %%mm1, %%mm4                \n\t"
                "packssdw %%mm3, %%mm2                \n\t"
                "pmaddwd %%mm5, %%mm4                \n\t"
                "pmaddwd %%mm5, %%mm2                \n\t"
                "add $24, %%"REG_b"                \n\t"
                "packssdw %%mm2, %%mm4                \n\t"
                "psraw $7, %%mm4                \n\t"

                "packuswb %%mm4, %%mm0                \n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0        \n\t"

                "movq %%mm0, (%1, %%"REG_a")        \n\t"
                "add $8, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "r" (src+width*3), "r" (dst+width), "g" (-width)
                : "%"REG_a, "%"REG_b
        );
#else
        /* Scalar fallback; 33<<(RGB2YUV_SHIFT-1) == 16.5<<RGB2YUV_SHIFT,
         * i.e. the +16 luma offset plus a +0.5 rounding term. */
        int i;
        for(i=0; i<width; i++)
        {
                int b= src[i*3+0];
                int g= src[i*3+1];
                int r= src[i*3+2];

                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
        }
#endif
}
1944

    
1945
/* Convert two lines of packed 24-bit BGR to subsampled U/V. Each output
 * sample averages a 2x2 block of input pixels. */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
        /* 4 U/V samples (24 source bytes per line) per iteration.
         * REG_a = negative output offset, REG_b = 6*offset into the
         * 3-byte-per-pixel sources. mm6 holds bgr2UCoeff for U; V is
         * computed with bgr2VCoeff loaded fresh each round.
         * NOTE(review): clobbers REG_b — conflicts with 32-bit PIC builds. */
        asm volatile(
                "mov %4, %%"REG_a"                \n\t"
                "movq "MANGLE(w1111)", %%mm5                \n\t"
                "movq "MANGLE(bgr2UCoeff)", %%mm6                \n\t"
                "pxor %%mm7, %%mm7                \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"        \n\t"
                "add %%"REG_b", %%"REG_b"        \n\t"
                ASMALIGN(4)
                "1:                                \n\t"
                PREFETCH" 64(%0, %%"REG_b")        \n\t"
                PREFETCH" 64(%1, %%"REG_b")        \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                /* fast path: average the two lines and neighbour pixels
                 * with PAVGB before widening */
                "movq (%0, %%"REG_b"), %%mm0        \n\t"
                "movq (%1, %%"REG_b"), %%mm1        \n\t"
                "movq 6(%0, %%"REG_b"), %%mm2        \n\t"
                "movq 6(%1, %%"REG_b"), %%mm3        \n\t"
                PAVGB(%%mm1, %%mm0)
                PAVGB(%%mm3, %%mm2)
                "movq %%mm0, %%mm1                \n\t"
                "movq %%mm2, %%mm3                \n\t"
                "psrlq $24, %%mm0                \n\t"
                "psrlq $24, %%mm2                \n\t"
                PAVGB(%%mm1, %%mm0)
                PAVGB(%%mm3, %%mm2)
                "punpcklbw %%mm7, %%mm0                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
#else
                /* plain MMX: widen to words and add the four pixels, then
                 * /4 via psrlw $2 */
                "movd (%0, %%"REG_b"), %%mm0        \n\t"
                "movd (%1, %%"REG_b"), %%mm1        \n\t"
                "movd 3(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 3(%1, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm0                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "paddw %%mm1, %%mm0                \n\t"
                "paddw %%mm3, %%mm2                \n\t"
                "paddw %%mm2, %%mm0                \n\t"
                "movd 6(%0, %%"REG_b"), %%mm4        \n\t"
                "movd 6(%1, %%"REG_b"), %%mm1        \n\t"
                "movd 9(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 9(%1, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm4                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "paddw %%mm1, %%mm4                \n\t"
                "paddw %%mm3, %%mm2                \n\t"
                "paddw %%mm4, %%mm2                \n\t"
                "psrlw $2, %%mm0                \n\t"
                "psrlw $2, %%mm2                \n\t"
#endif
                "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
                "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"

                /* mm1/mm3 accumulate V, mm0/mm2 are reused for U */
                "pmaddwd %%mm0, %%mm1                \n\t"
                "pmaddwd %%mm2, %%mm3                \n\t"
                "pmaddwd %%mm6, %%mm0                \n\t"
                "pmaddwd %%mm6, %%mm2                \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                "packssdw %%mm2, %%mm0                \n\t"
                "packssdw %%mm3, %%mm1                \n\t"
                "pmaddwd %%mm5, %%mm0                \n\t"
                "pmaddwd %%mm5, %%mm1                \n\t"
                "packssdw %%mm1, %%mm0                \n\t" // V1 V0 U1 U0
                "psraw $7, %%mm0                \n\t"

                /* second pair of output samples (source bytes 12..23) */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                "movq 12(%0, %%"REG_b"), %%mm4        \n\t"
                "movq 12(%1, %%"REG_b"), %%mm1        \n\t"
                "movq 18(%0, %%"REG_b"), %%mm2        \n\t"
                "movq 18(%1, %%"REG_b"), %%mm3        \n\t"
                PAVGB(%%mm1, %%mm4)
                PAVGB(%%mm3, %%mm2)
                "movq %%mm4, %%mm1                \n\t"
                "movq %%mm2, %%mm3                \n\t"
                "psrlq $24, %%mm4                \n\t"
                "psrlq $24, %%mm2                \n\t"
                PAVGB(%%mm1, %%mm4)
                PAVGB(%%mm3, %%mm2)
                "punpcklbw %%mm7, %%mm4                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
#else
                "movd 12(%0, %%"REG_b"), %%mm4        \n\t"
                "movd 12(%1, %%"REG_b"), %%mm1        \n\t"
                "movd 15(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 15(%1, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm4                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "paddw %%mm1, %%mm4                \n\t"
                "paddw %%mm3, %%mm2                \n\t"
                "paddw %%mm2, %%mm4                \n\t"
                "movd 18(%0, %%"REG_b"), %%mm5        \n\t"
                "movd 18(%1, %%"REG_b"), %%mm1        \n\t"
                "movd 21(%0, %%"REG_b"), %%mm2        \n\t"
                "movd 21(%1, %%"REG_b"), %%mm3        \n\t"
                "punpcklbw %%mm7, %%mm5                \n\t"
                "punpcklbw %%mm7, %%mm1                \n\t"
                "punpcklbw %%mm7, %%mm2                \n\t"
                "punpcklbw %%mm7, %%mm3                \n\t"
                "paddw %%mm1, %%mm5                \n\t"
                "paddw %%mm3, %%mm2                \n\t"
                "paddw %%mm5, %%mm2                \n\t"
                /* mm5 was used as scratch above — restore the w1111 constant */
                "movq "MANGLE(w1111)", %%mm5                \n\t"
                "psrlw $2, %%mm4                \n\t"
                "psrlw $2, %%mm2                \n\t"
#endif
                "movq "MANGLE(bgr2VCoeff)", %%mm1                \n\t"
                "movq "MANGLE(bgr2VCoeff)", %%mm3                \n\t"

                "pmaddwd %%mm4, %%mm1                \n\t"
                "pmaddwd %%mm2, %%mm3                \n\t"
                "pmaddwd %%mm6, %%mm4                \n\t"
                "pmaddwd %%mm6, %%mm2                \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4                \n\t"
                "psrad $8, %%mm1                \n\t"
                "psrad $8, %%mm2                \n\t"
                "psrad $8, %%mm3                \n\t"
#endif
                "packssdw %%mm2, %%mm4                \n\t"
                "packssdw %%mm3, %%mm1                \n\t"
                "pmaddwd %%mm5, %%mm4                \n\t"
                "pmaddwd %%mm5, %%mm1                \n\t"
                "add $24, %%"REG_b"                \n\t"
                "packssdw %%mm1, %%mm4                \n\t" // V3 V2 U3 U2
                "psraw $7, %%mm4                \n\t"

                /* interleave both halves, pack to bytes, add the +128 bias */
                "movq %%mm0, %%mm1                \n\t"
                "punpckldq %%mm4, %%mm0                \n\t"
                "punpckhdq %%mm4, %%mm1                \n\t"
                "packsswb %%mm1, %%mm0                \n\t"
                "paddb "MANGLE(bgr2UVOffset)", %%mm0        \n\t"

                "movd %%mm0, (%2, %%"REG_a")        \n\t"
                "punpckhdq %%mm0, %%mm0                \n\t"
                "movd %%mm0, (%3, %%"REG_a")        \n\t"
                "add $4, %%"REG_a"                \n\t"
                " js 1b                                \n\t"
                : : "r" (src1+width*6), "r" (src2+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
                : "%"REG_a, "%"REG_b
        );
#else
        /* Scalar fallback: sum the 2x2 block per channel; the extra +2 in
         * the shift divides the 4-pixel sums back down. */
        int i;
        for(i=0; i<width; i++)
        {
                int b= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
                int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
                int r= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];

                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
        }
#endif
}
2110

    
2111
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2112
{
2113
        int i;
2114
        for(i=0; i<width; i++)
2115
        {
2116
                int d= ((uint16_t*)src)[i];
2117
                int b= d&0x1F;
2118
                int g= (d>>5)&0x3F;
2119
                int r= (d>>11)&0x1F;
2120

    
2121
                dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2122
        }
2123
}
2124

    
2125
/* Convert two lines of RGB565 to subsampled U/V; each output sample is
 * derived from a 2x2 block of input pixels. */
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
        int i;
        for(i=0; i<width; i++)
        {
                /* one 32-bit read fetches two horizontally adjacent pixels per line */
                int d0= ((uint32_t*)src1)[i];
                int d1= ((uint32_t*)src2)[i];

                /* SWAR: the masks separate the channel fields of both packed
                 * pixels so they can be summed without cross-field carries */
                int dl= (d0&0x07E0F81F) + (d1&0x07E0F81F);
                int dh= ((d0>>5)&0x07C0F83F) + ((d1>>5)&0x07C0F83F);

                /* rotate the high-pixel sums into alignment with the low-pixel
                 * sums and fold everything into one word */
                int dh2= (dh>>11) + (dh<<21);
                int d= dh2 + dl;

                /* b/g/r now hold 4-pixel sums of the 5/6/5-bit channels */
                int b= d&0x7F;
                int r= (d>>11)&0x7F;
                int g= d>>21;
                /* shift: +2 divides the 4-pixel sums, -2 compensates for the
                 * reduced channel depth (matching bgr16ToY's scaling) */
                dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
                dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+2-2)) + 128;
        }
}
2146

    
2147
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2148
{
2149
        int i;
2150
        for(i=0; i<width; i++)
2151
        {
2152
                int d= ((uint16_t*)src)[i];
2153
                int b= d&0x1F;
2154
                int g= (d>>5)&0x1F;
2155
                int r= (d>>10)&0x1F;
2156

    
2157
                dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2158
        }
2159
}
2160

    
2161
/* Convert two lines of RGB555 to subsampled U/V; each output sample is
 * derived from a 2x2 block of input pixels. Structure mirrors bgr16ToUV
 * with 5-bit green masks. */
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
        int i;
        for(i=0; i<width; i++)
        {
                /* one 32-bit read fetches two horizontally adjacent pixels per line */
                int d0= ((uint32_t*)src1)[i];
                int d1= ((uint32_t*)src2)[i];

                /* SWAR: masks isolate channel fields of both packed pixels so
                 * the sums cannot carry across field boundaries */
                int dl= (d0&0x03E07C1F) + (d1&0x03E07C1F);
                int dh= ((d0>>5)&0x03E0F81F) + ((d1>>5)&0x03E0F81F);

                /* rotate the high-pixel sums into alignment and fold */
                int dh2= (dh>>11) + (dh<<21);
                int d= dh2 + dl;

                /* b/g/r now hold 4-pixel sums of the 5-bit channels */
                int b= d&0x7F;
                int r= (d>>10)&0x7F;
                int g= d>>21;
                /* shift: +2 divides the 4-pixel sums, -3 compensates for the
                 * 5-bit channel depth (matching bgr15ToY's scaling) */
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2-3)) + 128;
        }
}
2182

    
2183

    
2184
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2185
{
2186
        int i;
2187
        for(i=0; i<width; i++)
2188
        {
2189
                int r=  ((uint32_t*)src)[i]&0xFF;
2190
                int g= (((uint32_t*)src)[i]>>8)&0xFF;
2191
                int b= (((uint32_t*)src)[i]>>16)&0xFF;
2192

    
2193
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2194
        }
2195
}
2196

    
2197
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2198
{
2199
        int i;
2200
        for(i=0; i<width; i++)
2201
        {
2202
                const int a= ((uint32_t*)src1)[2*i+0];
2203
                const int e= ((uint32_t*)src1)[2*i+1];
2204
                const int c= ((uint32_t*)src2)[2*i+0];
2205
                const int d= ((uint32_t*)src2)[2*i+1];
2206
                const int l= (a&0xFF00FF) + (e&0xFF00FF) + (c&0xFF00FF) + (d&0xFF00FF);
2207
                const int h= (a&0x00FF00) + (e&0x00FF00) + (c&0x00FF00) + (d&0x00FF00);
2208
                 const int r=  l&0x3FF;
2209
                const int g=  h>>8;
2210
                const int b=  l>>16;
2211

    
2212
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2213
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2214
        }
2215
}
2216

    
2217
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2218
{
2219
        int i;
2220
        for(i=0; i<width; i++)
2221
        {
2222
                int r= src[i*3+0];
2223
                int g= src[i*3+1];
2224
                int b= src[i*3+2];
2225

    
2226
                dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2227
        }
2228
}
2229

    
2230
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2231
{
2232
        int i;
2233
        for(i=0; i<width; i++)
2234
        {
2235
                int r= src1[6*i + 0] + src1[6*i + 3] + src2[6*i + 0] + src2[6*i + 3];
2236
                int g= src1[6*i + 1] + src1[6*i + 4] + src2[6*i + 1] + src2[6*i + 4];
2237
                int b= src1[6*i + 2] + src1[6*i + 5] + src2[6*i + 2] + src2[6*i + 5];
2238

    
2239
                dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+2)) + 128;
2240
                dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+2)) + 128;
2241
        }
2242
}
2243

    
2244

    
2245
// Bilinear / Bicubic scaling
2246
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2247
                                  int16_t *filter, int16_t *filterPos, long filterSize)
2248
{
2249
#ifdef HAVE_MMX
2250
        assert(filterSize % 4 == 0 && filterSize>0);
2251
        if(filterSize==4) // allways true for upscaling, sometimes for down too
2252
        {
2253
                long counter= -2*dstW;
2254
                filter-= counter*2;
2255
                filterPos-= counter/2;
2256
                dst-= counter/2;
2257
                asm volatile(
2258
                        "pxor %%mm7, %%mm7                \n\t"
2259
                        "movq "MANGLE(w02)", %%mm6        \n\t"
2260
                        "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2261
                        "mov %%"REG_a", %%"REG_BP"        \n\t"
2262
                        ASMALIGN(4)
2263
                        "1:                                \n\t"
2264
                        "movzwl (%2, %%"REG_BP"), %%eax        \n\t"
2265
                        "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2266
                        "movq (%1, %%"REG_BP", 4), %%mm1\n\t"
2267
                        "movq 8(%1, %%"REG_BP", 4), %%mm3\n\t"
2268
                        "movd (%3, %%"REG_a"), %%mm0        \n\t"
2269
                        "movd (%3, %%"REG_b"), %%mm2        \n\t"
2270
                        "punpcklbw %%mm7, %%mm0                \n\t"
2271
                        "punpcklbw %%mm7, %%mm2                \n\t"
2272
                        "pmaddwd %%mm1, %%mm0                \n\t"
2273
                        "pmaddwd %%mm2, %%mm3                \n\t"
2274
                        "psrad $8, %%mm0                \n\t"
2275
                        "psrad $8, %%mm3                \n\t"
2276
                        "packssdw %%mm3, %%mm0                \n\t"
2277
                        "pmaddwd %%mm6, %%mm0                \n\t"
2278
                        "packssdw %%mm0, %%mm0                \n\t"
2279
                        "movd %%mm0, (%4, %%"REG_BP")        \n\t"
2280
                        "add $4, %%"REG_BP"                \n\t"
2281
                        " jnc 1b                        \n\t"
2282

    
2283
                        "pop %%"REG_BP"                        \n\t"
2284
                        : "+a" (counter)
2285
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2286
                        : "%"REG_b
2287
                );
2288
        }
2289
        else if(filterSize==8)
2290
        {
2291
                long counter= -2*dstW;
2292
                filter-= counter*4;
2293
                filterPos-= counter/2;
2294
                dst-= counter/2;
2295
                asm volatile(
2296
                        "pxor %%mm7, %%mm7                \n\t"
2297
                        "movq "MANGLE(w02)", %%mm6        \n\t"
2298
                        "push %%"REG_BP"                \n\t" // we use 7 regs here ...
2299
                        "mov %%"REG_a", %%"REG_BP"        \n\t"
2300
                        ASMALIGN(4)
2301
                        "1:                                \n\t"
2302
                        "movzwl (%2, %%"REG_BP"), %%eax        \n\t"
2303
                        "movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
2304
                        "movq (%1, %%"REG_BP", 8), %%mm1\n\t"
2305
                        "movq 16(%1, %%"REG_BP", 8), %%mm3\n\t"
2306
                        "movd (%3, %%"REG_a"), %%mm0        \n\t"
2307
                        "movd (%3, %%"REG_b"), %%mm2        \n\t"
2308
                        "punpcklbw %%mm7, %%mm0                \n\t"
2309
                        "punpcklbw %%mm7, %%mm2                \n\t"
2310
                        "pmaddwd %%mm1, %%mm0                \n\t"
2311
                        "pmaddwd %%mm2, %%mm3                \n\t"
2312

    
2313
                        "movq 8(%1, %%"REG_BP", 8), %%mm1\n\t"
2314
                        "movq 24(%1, %%"REG_BP", 8), %%mm5\n\t"
2315
                        "movd 4(%3, %%"REG_a"), %%mm4        \n\t"
2316
                        "movd 4(%3, %%"REG_b"), %%mm2        \n\t"
2317
                        "punpcklbw %%mm7, %%mm4                \n\t"
2318
                        "punpcklbw %%mm7, %%mm2                \n\t"
2319
                        "pmaddwd %%mm1, %%mm4                \n\t"
2320
                        "pmaddwd %%mm2, %%mm5                \n\t"
2321
                        "paddd %%mm4, %%mm0                \n\t"
2322
                        "paddd %%mm5, %%mm3                \n\t"
2323
                                                
2324
                        "psrad $8, %%mm0                \n\t"
2325
                        "psrad $8, %%mm3                \n\t"
2326
                        "packssdw %%mm3, %%mm0                \n\t"
2327
                        "pmaddwd %%mm6, %%mm0                \n\t"
2328
                        "packssdw %%mm0, %%mm0                \n\t"
2329
                        "movd %%mm0, (%4, %%"REG_BP")        \n\t"
2330
                        "add $4, %%"REG_BP"                \n\t"
2331
                        " jnc 1b                        \n\t"
2332

    
2333
                        "pop %%"REG_BP"                        \n\t"
2334
                        : "+a" (counter)
2335
                        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2336
                        : "%"REG_b
2337
                );
2338
        }
2339
        else
2340
        {
2341
                uint8_t *offset = src+filterSize;
2342
                long counter= -2*dstW;
2343
//                filter-= counter*filterSize/2;
2344
                filterPos-= counter/2;
2345
                dst-= counter/2;
2346
                asm volatile(
2347
                        "pxor %%mm7, %%mm7                \n\t"
2348
                        "movq "MANGLE(w02)", %%mm6        \n\t"
2349
                        ASMALIGN(4)
2350
                        "1:                                \n\t"
2351
                        "mov %2, %%"REG_c"                \n\t"
2352
                        "movzwl (%%"REG_c", %0), %%eax        \n\t"
2353
                        "movzwl 2(%%"REG_c", %0), %%ebx        \n\t"
2354
                        "mov %5, %%"REG_c"                \n\t"
2355
                        "pxor %%mm4, %%mm4                \n\t"
2356
                        "pxor %%mm5, %%mm5                \n\t"
2357
                        "2:                                \n\t"
2358
                        "movq (%1), %%mm1                \n\t"
2359
                        "movq (%1, %6), %%mm3                \n\t"
2360
                        "movd (%%"REG_c", %%"REG_a"), %%mm0\n\t"
2361
                        "movd (%%"REG_c", %%"REG_b"), %%mm2\n\t"
2362
                        "punpcklbw %%mm7, %%mm0                \n\t"
2363
                        "punpcklbw %%mm7, %%mm2                \n\t"
2364
                        "pmaddwd %%mm1, %%mm0                \n\t"
2365
                        "pmaddwd %%mm2, %%mm3                \n\t"
2366
                        "paddd %%mm3, %%mm5                \n\t"
2367
                        "paddd %%mm0, %%mm4                \n\t"
2368
                        "add $8, %1                        \n\t"
2369
                        "add $4, %%"REG_c"                \n\t"
2370
                        "cmp %4, %%"REG_c"                \n\t"
2371
                        " jb 2b                                \n\t"
2372
                        "add %6, %1                        \n\t"
2373
                        "psrad $8, %%mm4                \n\t"
2374
                        "psrad $8, %%mm5                \n\t"
2375
                        "packssdw %%mm5, %%mm4                \n\t"
2376
                        "pmaddwd %%mm6, %%mm4                \n\t"
2377
                        "packssdw %%mm4, %%mm4                \n\t"
2378
                        "mov %3, %%"REG_a"                \n\t"
2379
                        "movd %%mm4, (%%"REG_a", %0)        \n\t"
2380
                        "add $4, %0                        \n\t"
2381
                        " jnc 1b                        \n\t"
2382

    
2383
                        : "+r" (counter), "+r" (filter)
2384
                        : "m" (filterPos), "m" (dst), "m"(offset),
2385
                          "m" (src), "r" (filterSize*2)
2386
                        : "%"REG_b, "%"REG_a, "%"REG_c
2387
                );
2388
        }
2389
#else
2390
#ifdef HAVE_ALTIVEC
2391
        hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2392
#else
2393
        int i;
2394
        for(i=0; i<dstW; i++)
2395
        {
2396
                int j;
2397
                int srcPos= filterPos[i];
2398
                int val=0;
2399
//                printf("filterPos: %d\n", filterPos[i]);
2400
                for(j=0; j<filterSize; j++)
2401
                {
2402
//                        printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2403
                        val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2404
                }
2405
//                filter += hFilterSize;
2406
                dst[i] = FFMIN(FFMAX(0, val>>7), (1<<15)-1); // the cubic equation does overflow ...
2407
//                dst[i] = val>>7;
2408
        }
2409
#endif
2410
#endif
2411
}
2412
      // *** horizontal scale Y line to temp buffer
/* Horizontally scale one luma (Y) line of srcW 8-bit input pixels into
 * dstWidth 16-bit samples in dst.  Packed-YUV and RGB/BGR sources are first
 * converted to a plain 8-bit Y line in formatConvBuffer, then scaled.
 * One of four paths runs:
 *   - the generic filter-based RENAME(hScale) (non-fast-bilinear, or MMX
 *     without MMX2),
 *   - runtime-generated MMX2 code reached through funnyYCode,
 *   - a plain x86 inline-asm bilinear loop,
 *   - a portable C bilinear loop.
 * Output samples are scaled by 128 (<<7) relative to the 8-bit input
 * (see the C fallback and the *128 tail patch below). */
static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode, 
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos)
{
    /* Convert packed / RGB input into an 8-bit luma line in formatConvBuffer
       so the scalers below only ever see planar 8-bit Y. */
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
        // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
            RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                /* Call the runtime-generated MMX2 scaler (funnyYCode) eight
                   times; between calls the source pointer (REG_c) and the
                   destination pointer (REG_D) are advanced using offsets
                   read from the mmx2FilterPos table held in REG_b.  The
                   x86-64 variant loads the offset into %esi and adds the
                   full-width register; the 32-bit variant adds directly. */
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "mov %0, %%"REG_c"                \n\t"
                        "mov %1, %%"REG_D"                \n\t"
                        "mov %2, %%"REG_d"                \n\t"
                        "mov %3, %%"REG_b"                \n\t"
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
                        PREFETCH" (%%"REG_c")                \n\t"
                        PREFETCH" 32(%%"REG_c")                \n\t"
                        PREFETCH" 64(%%"REG_c")                \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
                        "movl (%%"REG_b"), %%esi        \n\t"\
                        "call *%4                        \n\t"\
                        "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
                        "add %%"REG_S", %%"REG_c"        \n\t"\
                        "add %%"REG_a", %%"REG_D"        \n\t"\
                        "xor %%"REG_a", %%"REG_a"        \n\t"\

#else

#define FUNNY_Y_CODE \
                        "movl (%%"REG_b"), %%esi        \n\t"\
                        "call *%4                        \n\t"\
                        "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
                        "add %%"REG_a", %%"REG_D"        \n\t"\
                        "xor %%"REG_a", %%"REG_a"        \n\t"\

#endif

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

                        :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                        "m" (funnyYCode)
                        : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
                );
                /* Patch output pixels whose source position lands at or past
                   the last input pixel: replicate src[srcW-1], scaled by 128. */
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        /* Bilinear scaling, two output pixels per loop iteration.
           %ecx accumulates the 16-bit fractional source position (2*xalpha);
           REG_b holds the integer source index xx.  "addw/adc" advance the
           pair: the fraction's carry propagates into the index. */
        asm volatile(
                "xor %%"REG_a", %%"REG_a"        \n\t" // i
                "xor %%"REG_b", %%"REG_b"        \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                ASMALIGN(4)
                "1:                                \n\t"
                "movzbl  (%0, %%"REG_b"), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%"REG_b"), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"                \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adc %3, %%"REG_b"                \n\t" //xx+= xInc>>16 + carry

                "movzbl (%0, %%"REG_b"), %%edi        \n\t" //src[xx]
                "movzbl 1(%0, %%"REG_b"), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"                \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 2(%%"REG_D", %%"REG_a", 2)\n\t"
                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adc %3, %%"REG_b"                \n\t" //xx+= xInc>>16 + carry

                "add $2, %%"REG_a"                \n\t"
                "cmp %2, %%"REG_a"                \n\t"
                " jb 1b                                \n\t"

                :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
                : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* Portable C fallback: 16.16 fixed-point bilinear interpolation. */
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
                xpos+=xInc;
        }
#endif
    }
}
2584

    
2585
/* Horizontally scale one chroma line pair (U in src1, V in src2) of srcW
 * 8-bit input samples into dstWidth 16-bit samples each: U goes to dst,
 * V to dst+2048 (the two planes are 2048 uint16_t samples apart; the asm
 * below uses the equivalent 4096-byte offset).  Packed-YUV and RGB/BGR
 * sources are first converted into formatConvBuffer / formatConvBuffer+2048;
 * grayscale sources have no chroma and return immediately.
 * The same four code paths as RENAME(hyscale) apply: generic RENAME(hScale),
 * runtime-generated MMX2 code via funnyUVCode, plain x86 asm bilinear, or a
 * C bilinear fallback. */
inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos)
{
    /* Convert packed / RGB input into two planar 8-bit chroma lines. */
    if(srcFormat==IMGFMT_YUY2)
    {
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_UYVY)
    {
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR32)
    {
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR24)
    {
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR16)
    {
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_BGR15)
    {
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB32)
    {
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(srcFormat==IMGFMT_RGB24)
    {
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+2048;
    }
    else if(isGray(srcFormat))
    {
            return; // no chroma to scale
    }

#ifdef HAVE_MMX
        // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if(!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if(!(flags&SWS_FAST_BILINEAR))
#endif
    {
            RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
            RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86) || defined(ARCH_X86_64)
#ifdef HAVE_MMX2
        int i;
        if(canMMX2BeUsed)
        {
                /* Call the runtime-generated MMX2 scaler (funnyUVCode) four
                   times for the U plane (src1 -> dst), then reset the index,
                   switch to src2 and dst+4096 bytes, and run four more calls
                   for the V plane.  Source/destination pointers advance
                   between calls via offsets from the mmx2FilterPos table. */
                asm volatile(
                        "pxor %%mm7, %%mm7                \n\t"
                        "mov %0, %%"REG_c"                \n\t"
                        "mov %1, %%"REG_D"                \n\t"
                        "mov %2, %%"REG_d"                \n\t"
                        "mov %3, %%"REG_b"                \n\t"
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
                        PREFETCH" (%%"REG_c")                \n\t"
                        PREFETCH" 32(%%"REG_c")                \n\t"
                        PREFETCH" 64(%%"REG_c")                \n\t"

#ifdef ARCH_X86_64

#define FUNNY_UV_CODE \
                        "movl (%%"REG_b"), %%esi        \n\t"\
                        "call *%4                        \n\t"\
                        "movl (%%"REG_b", %%"REG_a"), %%esi\n\t"\
                        "add %%"REG_S", %%"REG_c"        \n\t"\
                        "add %%"REG_a", %%"REG_D"        \n\t"\
                        "xor %%"REG_a", %%"REG_a"        \n\t"\

#else

#define FUNNY_UV_CODE \
                        "movl (%%"REG_b"), %%esi        \n\t"\
                        "call *%4                        \n\t"\
                        "addl (%%"REG_b", %%"REG_a"), %%"REG_c"\n\t"\
                        "add %%"REG_a", %%"REG_D"        \n\t"\
                        "xor %%"REG_a", %%"REG_a"        \n\t"\

#endif

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
                        "xor %%"REG_a", %%"REG_a"        \n\t" // i
                        "mov %5, %%"REG_c"                \n\t" // src
                        "mov %1, %%"REG_D"                \n\t" // buf1
                        "add $4096, %%"REG_D"                \n\t"
                        PREFETCH" (%%"REG_c")                \n\t"
                        PREFETCH" 32(%%"REG_c")                \n\t"
                        PREFETCH" 64(%%"REG_c")                \n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

                        :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
                        "m" (funnyUVCode), "m" (src2)
                        : "%"REG_a, "%"REG_b, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
                );
                /* Patch output pixels whose source position lands at or past
                   the last input pixel: replicate the final U/V samples,
                   scaled by 128. */
                for(i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
                {
//                        printf("%d %d %d\n", dstWidth, i, srcW);
                        dst[i] = src1[srcW-1]*128;
                        dst[i+2048] = src2[srcW-1]*128;
                }
        }
        else
        {
#endif
        long xInc_shr16 = (long) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff; 
        /* Bilinear scaling, one U and one V output sample per iteration.
           %ecx accumulates the 16-bit fractional source position (2*xalpha);
           REG_b holds the integer source index xx, shared by both planes.
           V output is stored at a 4096-byte offset (= dst[i+2048]). */
        asm volatile(
                "xor %%"REG_a", %%"REG_a"        \n\t" // i
                "xor %%"REG_b", %%"REG_b"                \n\t" // xx
                "xorl %%ecx, %%ecx                \n\t" // 2*xalpha
                ASMALIGN(4)
                "1:                                \n\t"
                "mov %0, %%"REG_S"                \n\t"
                "movzbl  (%%"REG_S", %%"REG_b"), %%edi        \n\t" //src[xx]
                "movzbl 1(%%"REG_S", %%"REG_b"), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"                \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, (%%"REG_D", %%"REG_a", 2)\n\t"

                "movzbl  (%5, %%"REG_b"), %%edi        \n\t" //src[xx]
                "movzbl 1(%5, %%"REG_b"), %%esi        \n\t" //src[xx+1]
                "subl %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
                "imull %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
                "shll $16, %%edi                \n\t"
                "addl %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
                "mov %1, %%"REG_D"                \n\t"
                "shrl $9, %%esi                        \n\t"
                "movw %%si, 4096(%%"REG_D", %%"REG_a", 2)\n\t"

                "addw %4, %%cx                        \n\t" //2*xalpha += xInc&0xFFFF
                "adc %3, %%"REG_b"                \n\t" //xx+= xInc>>16 + carry
                "add $1, %%"REG_a"                \n\t"
                "cmp %2, %%"REG_a"                \n\t"
                " jb 1b                                \n\t"

/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC-4.0 */
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
                :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
                :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
                "r" (src2)
                : "%"REG_a, "%"REG_b, "%ecx", "%"REG_D, "%esi"
                );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* Portable C fallback: 16.16 fixed-point bilinear interpolation,
           both chroma planes per iteration. */
        int i;
        unsigned int xpos=0;
        for(i=0;i<dstWidth;i++)
        {
                register unsigned int xx=xpos>>16;
                register unsigned int xalpha=(xpos&0xFFFF)>>9;
                dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
                dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
/* slower
          dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
          dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
*/
                xpos+=xInc;
        }
#endif
   }
}
2790

    
2791
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2792
             int srcSliceH, uint8_t* dst[], int dstStride[]){
2793

    
2794
        /* load a few things into local vars to make the code more readable? and faster */
2795
        const int srcW= c->srcW;
2796
        const int dstW= c->dstW;
2797
        const int dstH= c->dstH;
2798
        const int chrDstW= c->chrDstW;
2799
        const int chrSrcW= c->chrSrcW;
2800
        const int lumXInc= c->lumXInc;
2801
        const int chrXInc= c->chrXInc;
2802
        const int dstFormat= c->dstFormat;
2803
        const int srcFormat= c->srcFormat;
2804
        const int flags= c->flags;
2805
        const int canMMX2BeUsed= c->canMMX2BeUsed;
2806
        int16_t *vLumFilterPos= c->vLumFilterPos;
2807
        int16_t *vChrFilterPos= c->vChrFilterPos;
2808
        int16_t *hLumFilterPos= c->hLumFilterPos;
2809
        int16_t *hChrFilterPos= c->hChrFilterPos;
2810
        int16_t *vLumFilter= c->vLumFilter;
2811
        int16_t *vChrFilter= c->vChrFilter;
2812
        int16_t *hLumFilter= c->hLumFilter;
2813
        int16_t *hChrFilter= c->hChrFilter;
2814
        int32_t *lumMmxFilter= c->lumMmxFilter;
2815
        int32_t *chrMmxFilter= c->chrMmxFilter;
2816
        const int vLumFilterSize= c->vLumFilterSize;
2817
        const int vChrFilterSize= c->vChrFilterSize;
2818
        const int hLumFilterSize= c->hLumFilterSize;
2819
        const int hChrFilterSize= c->hChrFilterSize;
2820
        int16_t **lumPixBuf= c->lumPixBuf;
2821
        int16_t **chrPixBuf= c->chrPixBuf;
2822
        const int vLumBufSize= c->vLumBufSize;
2823
        const int vChrBufSize= c->vChrBufSize;
2824
        uint8_t *funnyYCode= c->funnyYCode;
2825
        uint8_t *funnyUVCode= c->funnyUVCode;
2826
        uint8_t *formatConvBuffer= c->formatConvBuffer;
2827
        const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2828
        const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2829
        int lastDstY;
2830

    
2831
        /* vars whch will change and which we need to storw back in the context */
2832
        int dstY= c->dstY;
2833
        int lumBufIndex= c->lumBufIndex;
2834
        int chrBufIndex= c->chrBufIndex;
2835
        int lastInLumBuf= c->lastInLumBuf;
2836
        int lastInChrBuf= c->lastInChrBuf;
2837
        
2838
        if(isPacked(c->srcFormat)){
2839
                src[0]=
2840
                src[1]=
2841
                src[2]= src[0];
2842
                srcStride[0]=
2843
                srcStride[1]=
2844
                srcStride[2]= srcStride[0];
2845
        }
2846
        srcStride[1]<<= c->vChrDrop;
2847
        srcStride[2]<<= c->vChrDrop;
2848

    
2849
//        printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2850
//                (int)dst[0], (int)dst[1], (int)dst[2]);
2851

    
2852
#if 0 //self test FIXME move to a vfilter or something
2853
{
2854
static volatile int i=0;
2855
i++;
2856
if(srcFormat==IMGFMT_YV12 && i==1 && srcSliceH>= c->srcH)
2857
        selfTest(src, srcStride, c->srcW, c->srcH);
2858
i--;
2859
}
2860
#endif
2861

    
2862
//printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2863
//dstStride[0],dstStride[1],dstStride[2]);
2864

    
2865
        if(dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2866
        {
2867
                static int firstTime=1; //FIXME move this into the context perhaps
2868
                if(flags & SWS_PRINT_INFO && firstTime)
2869
                {
2870
                        MSG_WARN("SwScaler: Warning: dstStride is not aligned!\n"
2871
                                        "SwScaler:          ->cannot do aligned memory acesses anymore\n");
2872
                        firstTime=0;
2873
                }
2874
        }
2875

    
2876
        /* Note the user might start scaling the picture in the middle so this will not get executed
2877
           this is not really intended but works currently, so ppl might do it */
2878
        if(srcSliceY ==0){
2879
                lumBufIndex=0;
2880
                chrBufIndex=0;
2881
                dstY=0;        
2882
                lastInLumBuf= -1;
2883
                lastInChrBuf= -1;
2884
        }
2885

    
2886
        lastDstY= dstY;
2887

    
2888
        for(;dstY < dstH; dstY++){
2889
                unsigned char *dest =dst[0]+dstStride[0]*dstY;
2890
                const int chrDstY= dstY>>c->chrDstVSubSample;
2891
                unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2892
                unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2893

    
2894
                const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2895
                const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2896
                const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2897
                const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2898

    
2899
//printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2900
// dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
2901
                //handle holes (FAST_BILINEAR & weird filters)
2902
                if(firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2903
                if(firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2904
//printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2905
                ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
2906
                ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
2907

    
2908
                // Do we have enough lines in this slice to output the dstY line
2909
                if(lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2910
                {
2911
                        //Do horizontal scaling
2912
                        while(lastInLumBuf < lastLumSrcY)
2913
                        {
2914
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2915
                                lumBufIndex++;
2916
//                                printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
2917
                                ASSERT(lumBufIndex < 2*vLumBufSize)
2918
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2919
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2920
//                                printf("%d %d\n", lumBufIndex, vLumBufSize);
2921
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2922
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2923
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
2924
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
2925
                                lastInLumBuf++;
2926
                        }
2927
                        while(lastInChrBuf < lastChrSrcY)
2928
                        {
2929
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2930
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2931
                                chrBufIndex++;
2932
                                ASSERT(chrBufIndex < 2*vChrBufSize)
2933
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
2934
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2935
                                //FIXME replace parameters through context struct (some at least)
2936

    
2937
                                if(!(isGray(srcFormat) || isGray(dstFormat)))
2938
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2939
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2940
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
2941
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
2942
                                lastInChrBuf++;
2943
                        }
2944
                        //wrap buf index around to stay inside the ring buffer
2945
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2946
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2947
                }
2948
                else // not enough lines left in this slice -> load the rest in the buffer
2949
                {
2950
/*                printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2951
                        firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2952
                        lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2953
                        vChrBufSize, vLumBufSize);*/
2954

    
2955
                        //Do horizontal scaling
2956
                        while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2957
                        {
2958
                                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2959
                                lumBufIndex++;
2960
                                ASSERT(lumBufIndex < 2*vLumBufSize)
2961
                                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
2962
                                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
2963
                                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2964
                                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2965
                                                funnyYCode, c->srcFormat, formatConvBuffer, 
2966
                                                c->lumMmx2Filter, c->lumMmx2FilterPos);
2967
                                lastInLumBuf++;
2968
                        }
2969
                        while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2970
                        {
2971
                                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2972
                                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2973
                                chrBufIndex++;
2974
                                ASSERT(chrBufIndex < 2*vChrBufSize)
2975
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
2976
                                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
2977

    
2978
                                if(!(isGray(srcFormat) || isGray(dstFormat)))
2979
                                        RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2980
                                                flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2981
                                                funnyUVCode, c->srcFormat, formatConvBuffer, 
2982
                                                c->chrMmx2Filter, c->chrMmx2FilterPos);
2983
                                lastInChrBuf++;
2984
                        }
2985
                        //wrap buf index around to stay inside the ring buffer
2986
                        if(lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
2987
                        if(chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
2988
                        break; //we can't output a dstY line so let's try with the next slice
2989
                }
2990

    
2991
#ifdef HAVE_MMX
2992
                b5Dither= dither8[dstY&1];
2993
                g6Dither= dither4[dstY&1];
2994
                g5Dither= dither8[dstY&1];
2995
                r5Dither= dither8[(dstY+1)&1];
2996
#endif
2997
            if(dstY < dstH-2)
2998
            {
2999
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3000
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3001
#ifdef HAVE_MMX
3002
                int i;
3003
            if(flags & SWS_ACCURATE_RND){
3004
                        for(i=0; i<vLumFilterSize; i+=2){
3005
                                lumMmxFilter[2*i+0]= lumSrcPtr[i  ];
3006
                                lumMmxFilter[2*i+1]= lumSrcPtr[i+(vLumFilterSize>1)];
3007
                                lumMmxFilter[2*i+2]=
3008
                                lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i    ]
3009
                                                + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3010
                        }
3011
                        for(i=0; i<vChrFilterSize; i+=2){
3012
                                chrMmxFilter[2*i+0]= chrSrcPtr[i  ];
3013
                                chrMmxFilter[2*i+1]= chrSrcPtr[i+(vChrFilterSize>1)];
3014
                                chrMmxFilter[2*i+2]=
3015
                                chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i    ]
3016
                                                + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3017
                        }
3018
            }else{
3019
                for(i=0; i<vLumFilterSize; i++)
3020
                {
3021
                        lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3022
                        lumMmxFilter[4*i+2]= 
3023
                        lumMmxFilter[4*i+3]= 
3024
                                ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3025
                }
3026
                for(i=0; i<vChrFilterSize; i++)
3027
                {
3028
                        chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3029
                        chrMmxFilter[4*i+2]= 
3030
                        chrMmxFilter[4*i+3]= 
3031
                                ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3032
                }
3033
            }
3034
#endif
3035
                if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
3036
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3037
                        if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3038
                        RENAME(yuv2nv12X)(c,
3039
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3040
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3041
                                dest, uDest, dstW, chrDstW, dstFormat);
3042
                }
3043
                else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3044
                {
3045
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3046
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3047
                        if(vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
3048
                        {
3049
                                int16_t *lumBuf = lumPixBuf[0];
3050
                                int16_t *chrBuf= chrPixBuf[0];
3051
                                RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3052
                        }
3053
                        else //General YV12
3054
                        {
3055
                                RENAME(yuv2yuvX)(c,
3056
                                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3057
                                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3058
                                        dest, uDest, vDest, dstW, chrDstW);
3059
                        }
3060
                }
3061
                else
3062
                {
3063
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3064
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3065
                        if(vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
3066
                        {
3067
                                int chrAlpha= vChrFilter[2*dstY+1];
3068
                                RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3069
                                                 dest, dstW, chrAlpha, dstFormat, flags, dstY);
3070
                        }
3071
                        else if(vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
3072
                        {
3073
                                int lumAlpha= vLumFilter[2*dstY+1];
3074
                                int chrAlpha= vChrFilter[2*dstY+1];
3075
                                lumMmxFilter[2]=
3076
                                lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
3077
                                chrMmxFilter[2]=
3078
                                chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3079
                                RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3080
                                                 dest, dstW, lumAlpha, chrAlpha, dstY);
3081
                        }
3082
                        else //General RGB
3083
                        {
3084
                                RENAME(yuv2packedX)(c,
3085
                                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3086
                                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3087
                                        dest, dstW, dstY);
3088
                        }
3089
                }
3090
            }
3091
            else // hmm looks like we can't use MMX here without overwriting this array's tail
3092
            {
3093
                int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3094
                int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3095
                if(dstFormat == IMGFMT_NV12 || dstFormat == IMGFMT_NV21){
3096
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3097
                        if(dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3098
                        yuv2nv12XinC(
3099
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3100
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3101
                                dest, uDest, dstW, chrDstW, dstFormat);
3102
                }
3103
                else if(isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3104
                {
3105
                        const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3106
                        if((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3107
                        yuv2yuvXinC(
3108
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3109
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3110
                                dest, uDest, vDest, dstW, chrDstW);
3111
                }
3112
                else
3113
                {
3114
                        ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3115
                        ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3116
                        yuv2packedXinC(c, 
3117
                                vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3118
                                vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3119
                                dest, dstW, dstY);
3120
                }
3121
            }
3122
        }
3123

    
3124
#ifdef HAVE_MMX
3125
        __asm __volatile(SFENCE:::"memory");
3126