/* Source file: ffmpeg / libswscale / swscale_template.c @ revision 3164d25e */
1
/*
2
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation; either version 2 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU General Public License
17
 * along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 *
20
 * The C code (not assembly, MMX, ...) of this file can be used
21
 * under the LGPL license.
22
 */
23

    
24
/* Undefine all per-CPU-variant helper macros: this template is meant to be
 * re-included with different HAVE_* settings, and each pass redefines the
 * macros below for the current instruction-set variant. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
31

    
32
/* Instruction emitted to leave MMX state (so subsequent x87 FPU code works). */
#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif
38

    
39
/* Cache prefetch instructions (read and write-intent variants).
 * When the target has neither 3DNow! nor MMX2, expand to an assembler
 * comment so the surrounding asm strings still concatenate cleanly. */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
49

    
50
/* Store fence, needed after non-temporal stores (MOVNTQ); a no-op comment
 * on CPUs without MMX2 where plain movq stores are used instead. */
#if HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif
55

    
56
#if HAVE_MMX2
57
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58
#elif HAVE_AMD3DNOW
59
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60
#endif
61

    
62
/* 64-bit store: non-temporal (cache-bypassing) movntq on MMX2, plain movq
 * otherwise.  MOVNTQ is a one-level indirection over REAL_MOVNTQ so that
 * macro arguments are expanded before being stringized into the asm text. */
#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
68

    
69
#if HAVE_ALTIVEC
70
#include "swscale_altivec_template.c"
71
#endif
72

    
73
/* Multi-tap vertical scale to 8-bit planar output.
 * For each group of 8 output pixels, walks a zero-terminated list of
 * (srcLine pointer, 16-bit coefficient) entries starting at "offset"(%0)
 * inside the context (%0 = &c->redDither), accumulating
 * pmulhw(coeff, src[x + 2*i]) into two word accumulators seeded with the
 * VROUNDER_OFFSET rounding constant, then shifts right by 3, packs to
 * unsigned bytes and stores 8 bytes at once via MOVNTQ into dest (%1).
 * Loops until REG_a reaches width (%2).
 *   x      - extra byte offset applied to every source-line read
 *   offset - context offset of the filter descriptor list
 *   dest   - destination row pointer
 *   width  - number of output pixels */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t" /* next tap while src ptr != NULL */\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
108

    
109
/* Higher-precision variant of YSCALEYUV2YV12X.
 * Processes the filter list two taps at a time using the packed APCK layout
 * (APCK_PTR2 = second source pointer, APCK_COEF = interleaved coefficient
 * pair, APCK_SIZE = entry stride): word-interleaves samples from the two
 * lines (punpcklwd/punpckhwd) and uses pmaddwd so the products are summed
 * in 32-bit precision (mm4-mm7), then >>16, packs back to words, adds the
 * VROUNDER_OFFSET constant, >>3, packs to unsigned bytes and stores with
 * MOVNTQ.  Same operands as YSCALEYUV2YV12X. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t" /* next tap pair while next ptr != NULL */\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
170

    
171
/* One-tap (unscaled) vertical pass: 15-bit words from %0 are shifted right
 * by 7 and packed to unsigned bytes into %1, 8 pixels per iteration.
 * NOTE(review): REG_a is loaded from %2 and the loop exits via "jnc" after
 * "add $8" — this looks like %2 is a negative index so the loop ends when
 * the add carries past zero; confirm against the callers. */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
183

    
184
/* Rounding variant of YSCALEYUV2YV121: builds the constant 64 in every word
 * of mm7 (all-ones >>15 gives 1, <<6 gives 64) and adds it with signed
 * saturation before the >>7, so the conversion rounds to nearest instead of
 * truncating.  Same operand/loop convention as YSCALEYUV2YV121. */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t" /* mm7 = 0x0040 in each word */\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
201

    
202
/*
203
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205
       "r" (dest), "m" (dstW),
206
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
208
*/
209
/* Opens the packed-output scaling asm: outer loop label "1:" iterates over
 * output pixel groups (REG_a), inner loop "2:" accumulates the chroma filter
 * taps from the CHR_MMX_FILTER_OFFSET descriptor list.  U samples are read
 * at (src + REG_a), V samples VOF bytes further into the same buffer.
 * On exit of loop 2: mm3 = accumulated U, mm4 = accumulated V (both seeded
 * with the VROUNDER_OFFSET constant).  Must be paired with a *_YA macro and
 * YSCALEYUV2PACKEDX_END, which closes the asm statement. */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\

232

    
233
/* Luma counterpart of YSCALEYUV2PACKEDX_UV, with the registers
 * parameterized: accumulates the filter taps of the descriptor list at
 * "offset"(%0) into dst1/dst2 (seeded with the rounder), reading two
 * adjacent 4-pixel groups per tap into src1/src2 and scaling each with
 * "coeff" via pmulhw.  Reuses the inner label "2:", so it must follow a
 * _UV macro inside the same asm statement. */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\

251

    
252
/* Standard-precision packed-output prologue: chroma accumulation followed by
 * luma accumulation into mm1 (Y1) and mm7 (Y2), with mm3/mm4 holding U/V. */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

255

    
256
/* Closes the asm statement opened by the YSCALEYUV2PACKEDX* macros:
 * supplies the context pointer (%0), dest (%4) and dstW (%5), plus dummy
 * operands keeping the %2/%3 numbering stable, and declares the clobbers. */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
262

    
263
/* High-precision chroma accumulation for the packed-output path: like
 * YSCALEYUV2YV12X_ACCURATE it processes two taps at a time with the packed
 * APCK descriptor layout and pmaddwd 32-bit accumulation (mm4-mm7), then
 * >>16, packs to words and adds the rounder.  Because all eight MMX
 * registers are needed for the luma pass that follows, the finished U/V
 * words are parked in the context at U_TEMP/V_TEMP instead of registers.
 * Opens the asm statement; pair with _ACCURATE_YA and YSCALEYUV2PACKEDX_END. */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\

312

    
313
/* High-precision luma accumulation: same two-taps-at-a-time pmaddwd scheme
 * as the _UV macro above, accumulating Y1 into mm1/mm5 and Y2 into mm7/mm6,
 * then >>16, pack, add rounder.  Finishes by reloading the parked U/V words
 * from U_TEMP/V_TEMP into mm3/mm4, matching the register layout the RGB
 * conversion macros expect (mm1=Y1, mm3=U, mm4=V, mm7=Y2).  Must follow
 * _ACCURATE_UV inside the same asm statement (reuses label "2:"). */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\


    
358
/* High-precision packed-output prologue: chroma then luma accumulation,
 * leaving mm1=Y1, mm3=U, mm4=V, mm7=Y2 for the RGB conversion macros. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
361

    
362
/* YUV -> RGB matrix step for the X (multi-tap) path.
 * Input: mm1=Y1, mm7=Y2 (luma words), mm3=U, mm4=V (chroma words).
 * Subtracts the U/V/Y offsets, multiplies by the per-context coefficients
 * (pmulhw with U{B,G}/V{G,R}/Y coefficients read from %0), duplicates the
 * half-resolution chroma terms to full resolution with punpckl/hwd, adds
 * the luma, and packs with unsigned saturation.
 * Output: mm2=B bytes, mm4=G bytes, mm5=R bytes (8 pixels each). */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\


    
398
/* Two-row vertical linear interpolation for packed (YUYV-style) output.
 * First pre-shifts the stored chroma/luma blend coefficients right by 3
 * (writing them back into the context), then per iteration blends
 * uvbuf0/uvbuf1 (%2/%3, V at offset VOF) into mm3/mm4 and buf0/buf1
 * (%0/%1) into mm1/mm7 using the classic a + (b-a)*alpha formulation
 * (psubw + pmulhw + paddw), with the pass-through terms shifted >>7. */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

/* Argument-expanding wrapper (so index/c are macro-expanded before pasting). */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
435

    
436
/* Chroma part of the two-row RGB path: blends uvbuf0 (%2) and uvbuf1 (%3)
 * with the chroma blend coefficient (a + (b-a)*alpha, pass-through >>4),
 * then subtracts the U/V offsets and computes the green contributions.
 * Leaves mm2=(U-128), mm5=(V-128), mm3=ug, mm4=vg for the _YA/_COEFF
 * macros that follow. */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

460

    
461
/* Luma part of the two-row RGB path: blends rows b1 and b2 with the luma
 * blend coefficient (pass-through >>4), producing mm1=Y1 and mm7=Y2 for
 * REAL_YSCALEYUV2RGB_COEFF. */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

474

    
475
/* Final matrix/pack step of the two-row RGB path (same math as
 * YSCALEYUV2RGBX but reading coefficients relative to "c"):
 * input mm1=Y1, mm7=Y2, mm2=(U-128), mm5=(V-128), mm3=ug, mm4=vg;
 * output mm2=B bytes, mm4=G bytes, mm5=R bytes. */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

503

    
504
/* Argument-expanding wrapper for the luma blend step. */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/* Full two-row YUV->RGB pipeline: chroma blend, luma blend from buffers
 * %0/%1, then the matrix/pack step. */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
510

    
511
/* Single-row (no vertical blend) packed-output setup: reads one chroma
 * buffer (%2, V at offset VOF) and one luma buffer (%0) and shifts the
 * 15-bit words down by 7, leaving mm3=U, mm4=V, mm1=Y1, mm7=Y2. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

/* Argument-expanding wrapper. */
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
525

    
526
/* Single-row YUV->RGB: like YSCALEYUV2RGB but with no vertical blending —
 * one chroma buffer (%2) and one luma buffer (%0), each just shifted >>4
 * before the usual offset/coefficient/pack sequence.
 * Output: mm2=B bytes, mm4=G bytes, mm5=R bytes. */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

/* Argument-expanding wrapper. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
574

    
575
/* Load and average chroma from two lines (uvbuf0/uvbuf1, simple vertical
 * interpolation by add + shift), then load luma from buf0, leaving Y in
 * mm1/mm7 and U/V in mm3/mm4 for a packed-pixel (non-RGB) writer. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" /* average of the two chroma lines */\
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" /* scale luma back to 8 bits */\
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
592

    
593
// do vertical chrominance interpolation
/* Like the single-line YUV->RGB macro, but averages chroma from uvbuf0 and
 * uvbuf1 first. Output: packed B in mm2, G in mm4, R in mm5 (8 pixels). */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
646

    
647
/* Load 8 alpha samples from abuf0, scale them back to 8 bits and pack
 * into mm7, for use by the RGB32-with-alpha writer. */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
654

    
655
/* Interleave the packed b/g/r/a registers into 8 ARGB pixels and store
 * them at dst + index*4, then advance index and loop back to label 1. */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
678

    
679
/* Pack B (mm2), G (mm4), R (mm5) down to RGB565 (5-6-5 bits) and store
 * 8 pixels at dst + index*2; loops back to label 1 until index == dstw. */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G keeps 6 bits */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
706

    
707
/* Pack B (mm2), G (mm4), R (mm5) down to RGB555 (5-5-5 bits) and store
 * 8 pixels at dst + index*2; loops back to label 1 until index == dstw. */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G keeps only 5 bits here */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
735

    
736
/* Legacy shift/mask 24-bit packer: turns 8 pixels of packed B/G/R into
 * 24 bytes of BGR24 at dst; kept for reference, superseded by
 * WRITEBGR24MMX / WRITEBGR24MMX2 below. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
791

    
792
/* Plain-MMX 24-bit packer: converts 8 pixels of packed B/G/R into 24
 * bytes of BGR24, stored via three MOVNTQs; advances dst by 24 and
 * index by 8, looping back to label 1. */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
844

    
845
/* MMX2 24-bit packer: uses pshufw plus the ff_M24A/B/C byte masks to
 * shuffle 8 pixels of packed B/G/R into 24 bytes of BGR24; faster than
 * the plain-MMX shift/or version above on CPUs with pshufw. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
892

    
893
/* Select the fastest BGR24 writer available for the current target. */
#if HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
900

    
901
/* Interleave luma (mm1/mm7) with packed U (mm3) and V (mm4) into YUYV
 * order and store 16 bytes at dst + index*2; loops back to label 1. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t" /* UVUVUVUV */\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t" /* YUYVYUYV 0 */\
    "punpckhbw %%mm3, %%mm7     \n\t" /* YUYVYUYV 1 */\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
917

    
918

    
919
/* Vertically scale/filter planar YUV (plus optional alpha) into Y, U, V
 * and A planes. Uses the MMX YSCALEYUV2YV12X macros unless bit-exact
 * output is requested, then falls back to AltiVec or plain C. */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize, int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
                /* V plane lives at offset VOF within the chroma buffer */
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if HAVE_ALTIVEC
/* NOTE(review): the AltiVec path has no alpha-plane support; alpSrc/aDest
 * are dropped here. */
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}
959

    
960
/*
 * Vertically scale/filter planar YUV into an NV12/NV21-style layout
 * (luma plane plus one interleaved chroma plane).
 *
 * There is no SIMD implementation for this output layout, so every
 * target simply forwards to the portable C version; the SwsContext is
 * accepted only to keep the signature uniform with its siblings.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
968

    
969
/* Unfiltered (1-tap) vertical scale: round the 14-bit intermediate
 * samples down to 8 bits and clip, for Y, U/V and optional alpha planes.
 * MMX path processes each plane with a negative-counter loop; the C
 * fallback does the same rounding ((x+64)>>7) with explicit clipping. */
static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc, int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= 4;
        /* src points one element past the end of each plane; the asm walks
         * backwards using the negative counter below. V data sits at
         * offset VOFW inside the chroma buffer. */
        uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                if (dst[p]){    /* skip planes the caller did not request */
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }else{
            while(p--){
                if (dst[p]){
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        /* bit 8 set means the rounded value left [0,255]; pick the bound */
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++){
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
1041

    
1042

    
1043
/**
1044
 * vertical scale YV12 to RGB
1045
 */
1046
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1047
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1048
                                       int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1049
{
1050
#if HAVE_MMX
1051
    x86_reg dummy=0;
1052
    if(!(c->flags & SWS_BITEXACT)){
1053
        if (c->flags & SWS_ACCURATE_RND){
1054
            switch(c->dstFormat){
1055
            case PIX_FMT_RGB32:
1056
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1057
                    YSCALEYUV2PACKEDX_ACCURATE
1058
                    YSCALEYUV2RGBX
1059
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1060
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1061
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1062
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1063
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1064
                    "psraw                        $3, %%mm1         \n\t"
1065
                    "psraw                        $3, %%mm7         \n\t"
1066
                    "packuswb                  %%mm7, %%mm1         \n\t"
1067
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1068

    
1069
                    YSCALEYUV2PACKEDX_END
1070
                }else{
1071
                    YSCALEYUV2PACKEDX_ACCURATE
1072
                    YSCALEYUV2RGBX
1073
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1074
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1075

    
1076
                    YSCALEYUV2PACKEDX_END
1077
                }
1078
                return;
1079
            case PIX_FMT_BGR24:
1080
                YSCALEYUV2PACKEDX_ACCURATE
1081
                YSCALEYUV2RGBX
1082
                "pxor %%mm7, %%mm7 \n\t"
1083
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1084
                "add %4, %%"REG_c"                        \n\t"
1085
                WRITEBGR24(%%REGc, %5, %%REGa)
1086

    
1087

    
1088
                :: "r" (&c->redDither),
1089
                "m" (dummy), "m" (dummy), "m" (dummy),
1090
                "r" (dest), "m" (dstW)
1091
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1092
                );
1093
                return;
1094
            case PIX_FMT_RGB555:
1095
                YSCALEYUV2PACKEDX_ACCURATE
1096
                YSCALEYUV2RGBX
1097
                "pxor %%mm7, %%mm7 \n\t"
1098
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1099
#ifdef DITHER1XBPP
1100
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1101
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1102
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1103
#endif
1104

    
1105
                WRITERGB15(%4, %5, %%REGa)
1106
                YSCALEYUV2PACKEDX_END
1107
                return;
1108
            case PIX_FMT_RGB565:
1109
                YSCALEYUV2PACKEDX_ACCURATE
1110
                YSCALEYUV2RGBX
1111
                "pxor %%mm7, %%mm7 \n\t"
1112
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1113
#ifdef DITHER1XBPP
1114
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1115
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1116
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1117
#endif
1118

    
1119
                WRITERGB16(%4, %5, %%REGa)
1120
                YSCALEYUV2PACKEDX_END
1121
                return;
1122
            case PIX_FMT_YUYV422:
1123
                YSCALEYUV2PACKEDX_ACCURATE
1124
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1125

    
1126
                "psraw $3, %%mm3    \n\t"
1127
                "psraw $3, %%mm4    \n\t"
1128
                "psraw $3, %%mm1    \n\t"
1129
                "psraw $3, %%mm7    \n\t"
1130
                WRITEYUY2(%4, %5, %%REGa)
1131
                YSCALEYUV2PACKEDX_END
1132
                return;
1133
            }
1134
        }else{
1135
            switch(c->dstFormat)
1136
            {
1137
            case PIX_FMT_RGB32:
1138
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1139
                    YSCALEYUV2PACKEDX
1140
                    YSCALEYUV2RGBX
1141
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1142
                    "psraw                        $3, %%mm1         \n\t"
1143
                    "psraw                        $3, %%mm7         \n\t"
1144
                    "packuswb                  %%mm7, %%mm1         \n\t"
1145
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1146
                    YSCALEYUV2PACKEDX_END
1147
                }else{
1148
                    YSCALEYUV2PACKEDX
1149
                    YSCALEYUV2RGBX
1150
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1151
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1152
                    YSCALEYUV2PACKEDX_END
1153
                }
1154
                return;
1155
            case PIX_FMT_BGR24:
1156
                YSCALEYUV2PACKEDX
1157
                YSCALEYUV2RGBX
1158
                "pxor                    %%mm7, %%mm7       \n\t"
1159
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1160
                "add                        %4, %%"REG_c"   \n\t"
1161
                WRITEBGR24(%%REGc, %5, %%REGa)
1162

    
1163
                :: "r" (&c->redDither),
1164
                "m" (dummy), "m" (dummy), "m" (dummy),
1165
                "r" (dest),  "m" (dstW)
1166
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1167
                );
1168
                return;
1169
            case PIX_FMT_RGB555:
1170
                YSCALEYUV2PACKEDX
1171
                YSCALEYUV2RGBX
1172
                "pxor %%mm7, %%mm7 \n\t"
1173
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1174
#ifdef DITHER1XBPP
1175
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1176
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1177
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1178
#endif
1179

    
1180
                WRITERGB15(%4, %5, %%REGa)
1181
                YSCALEYUV2PACKEDX_END
1182
                return;
1183
            case PIX_FMT_RGB565:
1184
                YSCALEYUV2PACKEDX
1185
                YSCALEYUV2RGBX
1186
                "pxor %%mm7, %%mm7 \n\t"
1187
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1188
#ifdef DITHER1XBPP
1189
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1190
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1191
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1192
#endif
1193

    
1194
                WRITERGB16(%4, %5, %%REGa)
1195
                YSCALEYUV2PACKEDX_END
1196
                return;
1197
            case PIX_FMT_YUYV422:
1198
                YSCALEYUV2PACKEDX
1199
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1200

    
1201
                "psraw $3, %%mm3    \n\t"
1202
                "psraw $3, %%mm4    \n\t"
1203
                "psraw $3, %%mm1    \n\t"
1204
                "psraw $3, %%mm7    \n\t"
1205
                WRITEYUY2(%4, %5, %%REGa)
1206
                YSCALEYUV2PACKEDX_END
1207
                return;
1208
            }
1209
        }
1210
    }
1211
#endif /* HAVE_MMX */
1212
#if HAVE_ALTIVEC
1213
    /* The following list of supported dstFormat values should
1214
       match what's found in the body of ff_yuv2packedX_altivec() */
1215
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf
1216
       (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1217
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1218
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1219
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1220
                                   chrFilter, chrSrc, chrFilterSize,
1221
                                   dest, dstW, dstY);
1222
    else
1223
#endif
1224
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1225
                       chrFilter, chrSrc, chrFilterSize,
1226
                       alpSrc, dest, dstW, dstY);
1227
}
/**
 * vertical bilinear scale YV12 to RGB
 *
 * Blends two luma lines (buf0/buf1, weighted by yalpha) and two chroma
 * lines (uvbuf0/uvbuf1, weighted by uvalpha), then converts the result to
 * the packed format in c->dstFormat.  abuf0/abuf1 carry the alpha plane
 * when one is present.  yalpha1/uvalpha1 are the complementary weights
 * (4095 - alpha).
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
                          uint16_t *abuf0, uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if HAVE_MMX
    /* MMX fast paths; skipped when bit-exact output is requested. */
    if(!(c->flags & SWS_BITEXACT)){
        switch(c->dstFormat)
        {
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
#if ARCH_X86_64
                    /* x86-64 has enough registers to pass abuf0/abuf1 directly. */
                    __asm__ volatile(
                    "mov        %4, %%"REG_b"               \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb            %%mm7, %%mm1       \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                    ,"r" (abuf0), "r" (abuf1)
                    : "%"REG_b, "%"REG_BP
                    );
#else
                    /* x86-32: no spare registers, so the alpha buffer pointers
                     * are smuggled through c->u_temp / c->v_temp and loaded
                     * inside the asm via U_TEMP/V_TEMP offsets. */
                    *(uint16_t **)(&c->u_temp)=abuf0;
                    *(uint16_t **)(&c->v_temp)=abuf1;
                    __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push                   %0              \n\t"
                    "push                   %1              \n\t"
                    "mov          "U_TEMP"(%5), %0          \n\t"
                    "mov          "V_TEMP"(%5), %1          \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb            %%mm7, %%mm1       \n\t"
                    "pop                    %1              \n\t"
                    "pop                    %0              \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                    );
#endif
                }else{
                    /* No alpha plane: write a constant 0xFF alpha (pcmpeqd). */
                    __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov %4, %%"REG_b"                        \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            default: break;
        }
    }
#endif //HAVE_MMX
    /* Generic C fallback for all remaining output formats. */
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}
/**
 * YV12 to RGB without scaling or interpolating
 *
 * Single-source variant of yuv2packed2(): one luma line (buf0) and up to
 * two chroma lines are read.  When uvalpha < 2048 only uvbuf0 is used
 * (faster, but shifts chroma by half a pixel — see note below); otherwise
 * both uvbuf0 and uvbuf1 contribute (the "1b" asm/C variants).
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                          uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    /* Full-chroma interpolation requested: delegate to the 2-line blender
     * with both luma/alpha inputs pointing at the same line. */
    if (flags&SWS_FULL_CHR_H_INT)
    {
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

#if HAVE_MMX
    if(!(flags & SWS_BITEXACT)){
        if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        {
            switch(dstFormat)
            {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                    __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    YSCALEYUV2RGB1_ALPHA(%%REGBP)
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                    );
                }else{
                    __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2PACKED1(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            }
        }
        else
        {
            /* uvalpha >= 2048: use the "1b" macro variants, which read both
             * chroma lines. */
            switch(dstFormat)
            {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
                    __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    YSCALEYUV2RGB1_ALPHA(%%REGBP)
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                    );
                }else{
                    __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2PACKED1b(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* HAVE_MMX */
    /* Generic C fallbacks, mirroring the two chroma-weight regimes above. */
    if (uvalpha < 2048)
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
//FIXME yuy2* can read up to 7 samples too much

/* Extract the luma plane from packed YUYV: dst[i] = src[2*i]. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* Masks out every second byte (bm01010101), 8 output pixels per pass.
     * Loops with a negative index counting up to 0 over pre-biased pointers. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm2           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
    "pand                %%mm2, %%mm0           \n\t"
    "pand                %%mm2, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
    "add                    $8, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
/* Extract the chroma planes from packed YUYV: dstU[i] = src1[4*i + 1],
 * dstV[i] = src1[4*i + 3].  src1 and src2 must be the same line (asserted). */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* Shifts U/V into the low bytes, then splits them into the two planes;
     * 4 output samples per plane per pass. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "psrlw                  $8, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extract the luma plane from packed UYVY: dst[i] = src[2*i + 1]. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* Luma sits in the high byte here, so shift instead of masking. */
    __asm__ volatile(
    "mov                  %0, %%"REG_a"         \n\t"
    "1:                                         \n\t"
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
    "psrlw                $8, %%mm0             \n\t"
    "psrlw                $8, %%mm1             \n\t"
    "packuswb          %%mm1, %%mm0             \n\t"
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
    "add                  $8, %%"REG_a"         \n\t"
    " js                  1b                    \n\t"
    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
/* Extract the chroma planes from packed UYVY: dstU[i] = src1[4*i + 0],
 * dstV[i] = src1[4*i + 2].  src1 and src2 must be the same line (asserted). */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* U/V already occupy the low bytes, so mask them out and split. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "pand                %%mm4, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
/*
 * BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)
 * Expands to a reader that converts one line of packed RGB/BGR (element
 * type 'type') to 8-bit luma: each channel is extracted with a shift and
 * mask, then combined as (RY*r + GY*g + BY*b + rounding) >> S.
 */
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]>>shb)&maskb;\
        int g= (((type*)src)[i]>>shg)&maskg;\
        int r= (((type*)src)[i]>>shr)&maskr;\
\
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
    }\
}
/* Instantiate the luma readers for the supported packed RGB/BGR layouts. */
BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1771

    
1772
static inline void RENAME(abgrToA)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused){
1773
    int i;
1774
    for (i=0; i<width; i++){
1775
        dst[i]= src[4*i];
1776
    }
1777
}
/*
 * BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb,
 *        RU, GU, BU, RV, GV, BV, S)
 * Expands to two readers:
 *  - RENAME(name): per-pixel U/V from a line of packed RGB/BGR
 *    (channel extracted by mask+shift, combined and rounded >> S);
 *  - RENAME(name ## _half): horizontally-halved U/V computed from the
 *    channel sums of each pair of adjacent pixels (note the widened
 *    masks maskX|(2*maskX) to hold the carry of the pairwise sum).
 */
#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]&maskb)>>shb;\
        int g= (((type*)src)[i]&maskg)>>shg;\
        int r= (((type*)src)[i]&maskr)>>shr;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
    }\
}\
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int pix0= ((type*)src)[2*i+0];\
        int pix1= ((type*)src)[2*i+1];\
        int g= (pix0&~(maskr|maskb))+(pix1&~(maskr|maskb));\
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
        g&= maskg|(2*maskg);\
\
        g>>=shg;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
    }\
}
/* Instantiate the chroma readers for the supported packed RGB/BGR layouts. */
BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0,          0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0,          0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0,          0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0,          0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
#if HAVE_MMX
/* MMX: convert one line of packed 24-bit BGR (or RGB when srcFormat ==
 * PIX_FMT_RGB24) to 8-bit luma.  A format-specific coefficient pair is
 * preloaded into mm5/mm6, then 4 pixels (12 bytes) are processed per
 * iteration with pmaddwd. */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
{

    /* Select coefficient tables for the channel order of the input. */
    if(srcFormat == PIX_FMT_BGR24){
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }else{
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t"
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t"
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    : "%"REG_a
    );
}
/* MMX: convert one line of packed 24-bit BGR/RGB to 8-bit U and V.
 * The coefficient table is selected via the ff_bgr24toUV[] memory operand
 * (indexed by srcFormat); 4 output samples per plane per iteration. */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
{
    __asm__ volatile(
        "movq                    24+%4, %%mm6       \n\t"
        "mov                        %3, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                    %4, %%mm0       \n\t"
        "pmaddwd                  8+%4, %%mm1       \n\t"
        "pmaddwd                 16+%4, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                    %4, %%mm1       \n\t"
        "pmaddwd                  8+%4, %%mm3       \n\t"
        "pmaddwd                 16+%4, %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    : "%"REG_a
    );
}
#endif
1931

    
1932
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1933
{
1934
#if HAVE_MMX
1935
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1936
#else
1937
    int i;
1938
    for (i=0; i<width; i++)
1939
    {
1940
        int b= src[i*3+0];
1941
        int g= src[i*3+1];
1942
        int r= src[i*3+2];
1943

    
1944
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1945
    }
1946
#endif /* HAVE_MMX */
1947
}
1948

    
1949
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1950
{
1951
#if HAVE_MMX
1952
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1953
#else
1954
    int i;
1955
    for (i=0; i<width; i++)
1956
    {
1957
        int b= src1[3*i + 0];
1958
        int g= src1[3*i + 1];
1959
        int r= src1[3*i + 2];
1960

    
1961
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1962
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1963
    }
1964
#endif /* HAVE_MMX */
1965
    assert(src1 == src2);
1966
}
1967

    
1968
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1969
{
1970
    int i;
1971
    for (i=0; i<width; i++)
1972
    {
1973
        int b= src1[6*i + 0] + src1[6*i + 3];
1974
        int g= src1[6*i + 1] + src1[6*i + 4];
1975
        int r= src1[6*i + 2] + src1[6*i + 5];
1976

    
1977
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1978
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1979
    }
1980
    assert(src1 == src2);
1981
}
1982

    
1983
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1984
{
1985
#if HAVE_MMX
1986
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1987
#else
1988
    int i;
1989
    for (i=0; i<width; i++)
1990
    {
1991
        int r= src[i*3+0];
1992
        int g= src[i*3+1];
1993
        int b= src[i*3+2];
1994

    
1995
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1996
    }
1997
#endif
1998
}
1999

    
2000
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2001
{
2002
#if HAVE_MMX
2003
    assert(src1==src2);
2004
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2005
#else
2006
    int i;
2007
    assert(src1==src2);
2008
    for (i=0; i<width; i++)
2009
    {
2010
        int r= src1[3*i + 0];
2011
        int g= src1[3*i + 1];
2012
        int b= src1[3*i + 2];
2013

    
2014
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2015
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2016
    }
2017
#endif
2018
}
2019

    
2020
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2021
{
2022
    int i;
2023
    assert(src1==src2);
2024
    for (i=0; i<width; i++)
2025
    {
2026
        int r= src1[6*i + 0] + src1[6*i + 3];
2027
        int g= src1[6*i + 1] + src1[6*i + 4];
2028
        int b= src1[6*i + 2] + src1[6*i + 5];
2029

    
2030
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2031
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2032
    }
2033
}
2034

    
2035

    
2036
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2037
{
2038
    int i;
2039
    for (i=0; i<width; i++)
2040
    {
2041
        int d= src[i];
2042

    
2043
        dst[i]= pal[d] & 0xFF;
2044
    }
2045
}
2046

    
2047
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2048
{
2049
    int i;
2050
    assert(src1 == src2);
2051
    for (i=0; i<width; i++)
2052
    {
2053
        int p= pal[src1[i]];
2054

    
2055
        dstU[i]= p>>8;
2056
        dstV[i]= p>>16;
2057
    }
2058
}
2059

    
2060
static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2061
{
2062
    int i, j;
2063
    for (i=0; i<width/8; i++){
2064
        int d= ~src[i];
2065
        for(j=0; j<8; j++)
2066
            dst[8*i+j]= ((d>>(7-j))&1)*255;
2067
    }
2068
}
2069

    
2070
static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2071
{
2072
    int i, j;
2073
    for (i=0; i<width/8; i++){
2074
        int d= src[i];
2075
        for(j=0; j<8; j++)
2076
            dst[8*i+j]= ((d>>(7-j))&1)*255;
2077
    }
2078
}
2079

    
2080
// bilinear / bicubic scaling
// Generic horizontal FIR scaler: each output sample i is
//     sum_j src[filterPos[i] + j] * filter[filterSize*i + j]
// shifted right by 7 (the C path additionally clips to 0x7FFF).
// srcW and xInc are not used in this function; all source positions come
// from filterPos[]. The MMX paths process two output samples per iteration
// and therefore assume filterPos/dst allow paired access; filterSize must
// be a positive multiple of 4 on the MMX path (asserted).
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#if HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        // counter runs from -2*dstW up to 0 in steps of 4 bytes (two int16
        // outputs per loop); the pointers are rebased so that indexing with
        // the negative counter lands on element 0 at the start.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
        "push            %%"REG_b"              \n\t"
#endif
        "pxor                %%mm7, %%mm7       \n\t"
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        // load the two source offsets, the 4+4 filter taps and 4+4 src bytes
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
        "movd      (%3, %%"REG_a"), %%mm0       \n\t"
        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
        "punpcklbw           %%mm7, %%mm0       \n\t"
        "punpcklbw           %%mm7, %%mm2       \n\t"
        "pmaddwd             %%mm1, %%mm0       \n\t"
        "pmaddwd             %%mm2, %%mm3       \n\t"
        // horizontal add of the two dword pairs, >>7, pack to int16
        "movq                %%mm0, %%mm4       \n\t"
        "punpckldq           %%mm3, %%mm0       \n\t"
        "punpckhdq           %%mm3, %%mm4       \n\t"
        "paddd               %%mm4, %%mm0       \n\t"
        "psrad                  $7, %%mm0       \n\t"
        "packssdw            %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
        "add                    $4, %%"REG_BP"  \n\t"
        " jnc                   1b              \n\t"

        "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
        "pop             %%"REG_b"              \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        // Same structure as the 4-tap loop but with 8 taps: two pmaddwd
        // pairs per output sample, accumulated before the horizontal add.
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
        "push             %%"REG_b"             \n\t"
#endif
        "pxor                 %%mm7, %%mm7      \n\t"
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
        "mov              %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm0      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm0      \n\t"
        "pmaddwd              %%mm2, %%mm3      \n\t"

        // second half of the 8 taps (bytes 4..7 of each source window)
        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm4      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm4      \n\t"
        "pmaddwd              %%mm2, %%mm5      \n\t"
        "paddd                %%mm4, %%mm0      \n\t"
        "paddd                %%mm5, %%mm3      \n\t"
        "movq                 %%mm0, %%mm4      \n\t"
        "punpckldq            %%mm3, %%mm0      \n\t"
        "punpckhdq            %%mm3, %%mm4      \n\t"
        "paddd                %%mm4, %%mm0      \n\t"
        "psrad                   $7, %%mm0      \n\t"
        "packssdw             %%mm0, %%mm0      \n\t"
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
        "add                     $4, %%"REG_BP" \n\t"
        " jnc                    1b             \n\t"

        "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
        "pop              %%"REG_b"             \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else
    {
        // Arbitrary filterSize (multiple of 4): inner loop "2:" walks the
        // taps 4 at a time until the src cursor reaches 'offset'
        // (= src+filterSize), still producing two output samples per outer
        // iteration.
        uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
        "pxor                  %%mm7, %%mm7     \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "mov                      %2, %%"REG_c" \n\t"
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
        "mov                      %5, %%"REG_c" \n\t"
        "pxor                  %%mm4, %%mm4     \n\t"
        "pxor                  %%mm5, %%mm5     \n\t"
        "2:                                     \n\t"
        "movq                   (%1), %%mm1     \n\t"
        "movq               (%1, %6), %%mm3     \n\t"
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
        "punpcklbw             %%mm7, %%mm0     \n\t"
        "punpcklbw             %%mm7, %%mm2     \n\t"
        "pmaddwd               %%mm1, %%mm0     \n\t"
        "pmaddwd               %%mm2, %%mm3     \n\t"
        "paddd                 %%mm3, %%mm5     \n\t"
        "paddd                 %%mm0, %%mm4     \n\t"
        "add                      $8, %1        \n\t"
        "add                      $4, %%"REG_c" \n\t"
        "cmp                      %4, %%"REG_c" \n\t"
        " jb                      2b            \n\t"
        "add                      %6, %1        \n\t"
        "movq                  %%mm4, %%mm0     \n\t"
        "punpckldq             %%mm5, %%mm4     \n\t"
        "punpckhdq             %%mm5, %%mm0     \n\t"
        "paddd                 %%mm0, %%mm4     \n\t"
        "psrad                    $7, %%mm4     \n\t"
        "packssdw              %%mm4, %%mm4     \n\t"
        "mov                      %3, %%"REG_a" \n\t"
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
        "add                      $4, %0        \n\t"
        " jnc                     1b            \n\t"

        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" ((x86_reg)filterSize*2)
        : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    // Portable reference implementation.
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* HAVE_ALTIVEC */
#endif /* HAVE_MMX */
}

// *** horizontal scale Y line to temp buffer
// Scales one luma (or, when isAlpha is set, alpha) input line to dstWidth
// 15-bit intermediate samples in dst. Non-natively-supported input formats
// are first converted into formatConvBuffer by the per-format ToY/ToA
// helpers, after which 'src' is redirected to that buffer. The scaling
// itself is done by one of: the generic hScale() (when SWS_FAST_BILINEAR is
// off, or MMX2 is unavailable on the MMX build), the run-time generated MMX2
// code reached through funnyYCode, a plain-x86 asm bilinear loop, or a
// portable C bilinear loop. Finally, if srcRange != dstRange and the
// destination is not RGB/BGR, the luma values are remapped in place
// (scale+offset range conversion; constants implement the two directions).
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint32_t *pal, int isAlpha)
{
    // --- input format conversion into formatConvBuffer, if needed ---
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        if (isAlpha)
            RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal);
        else
            RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        if (isAlpha)
            RENAME(abgrToA)(formatConvBuffer, src, srcW, pal);
        else
            RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        if (isAlpha)
            RENAME(abgrToA)(formatConvBuffer, src+3, srcW, pal);
        else
            RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        if (isAlpha)
            RENAME(abgrToA)(formatConvBuffer, src, srcW, pal);
        else
            RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOBLACK)
    {
        RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOWHITE)
    {
        RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }

    // --- horizontal scaling ---
#if HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86 && CONFIG_GPL
#if HAVE_MMX2
        int i;
#if defined(PIC)
        // ebx is the PIC register and may not be clobbered: spill it manually.
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            // Dispatch into the run-time generated scaler (funnyYCode); each
            // FUNNY_Y_CODE invocation processes one chunk, using
            // mmx2FilterPos to advance the source pointer between chunks.
            __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyYCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            // Fix up trailing samples whose source window would read past
            // the end of the line: replicate the last source pixel (<<7).
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif /* HAVE_MMX2 */
        // 16.16 fixed-point bilinear in plain x86 asm, two samples per loop.
        x86_reg xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        // Portable C bilinear: 16.16 fixed-point position, 7-bit blend factor.
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif /* ARCH_X86 */
    }

    // --- optional in-place luma range conversion ---
    if(!isAlpha && c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++)
                dst[i]= (dst[i]*14071 + 33561947)>>14;
        }else{
            for (i=0; i<dstWidth; i++)
                dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
        }
    }
}

inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2511
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2512
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2513
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2514
                                   int32_t *mmx2FilterPos, uint32_t *pal)
2515
{
2516
    if (srcFormat==PIX_FMT_YUYV422)
2517
    {
2518
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2519
        src1= formatConvBuffer;
2520
        src2= formatConvBuffer+VOFW;
2521
    }
2522
    else if (srcFormat==PIX_FMT_UYVY422)
2523
    {
2524
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2525
        src1= formatConvBuffer;
2526
        src2= formatConvBuffer+VOFW;
2527
    }
2528
    else if (srcFormat==PIX_FMT_RGB32)
2529
    {
2530
        if(c->chrSrcHSubSample)
2531
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2532
        else
2533
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2534
        src1= formatConvBuffer;
2535
        src2= formatConvBuffer+VOFW;
2536
    }
2537
    else if (srcFormat==PIX_FMT_RGB32_1)
2538
    {
2539
        if(c->chrSrcHSubSample)
2540
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2541
        else
2542
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2543
        src1= formatConvBuffer;
2544
        src2= formatConvBuffer+VOFW;
2545
    }
2546
    else if (srcFormat==PIX_FMT_BGR24)
2547
    {
2548
        if(c->chrSrcHSubSample)
2549
            RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2550
        else
2551
            RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2552
        src1= formatConvBuffer;
2553
        src2= formatConvBuffer+VOFW;
2554
    }
2555
    else if (srcFormat==PIX_FMT_BGR565)
2556
    {
2557
        if(c->chrSrcHSubSample)
2558
            RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2559
        else
2560
            RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2561
        src1= formatConvBuffer;
2562
        src2= formatConvBuffer+VOFW;
2563
    }
2564
    else if (srcFormat==PIX_FMT_BGR555)
2565
    {
2566
        if(c->chrSrcHSubSample)
2567
            RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2568
        else
2569
            RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2570
        src1= formatConvBuffer;
2571
        src2= formatConvBuffer+VOFW;
2572
    }
2573
    else if (srcFormat==PIX_FMT_BGR32)
2574
    {
2575
        if(c->chrSrcHSubSample)
2576
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2577
        else
2578
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2579
        src1= formatConvBuffer;
2580
        src2= formatConvBuffer+VOFW;
2581
    }
2582
    else if (srcFormat==PIX_FMT_BGR32_1)
2583
    {
2584
        if(c->chrSrcHSubSample)
2585
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2586
        else
2587
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2588
        src1= formatConvBuffer;
2589
        src2= formatConvBuffer+VOFW;
2590
    }
2591
    else if (srcFormat==PIX_FMT_RGB24)
2592
    {
2593
        if(c->chrSrcHSubSample)
2594
            RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2595
        else
2596
            RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2597
        src1= formatConvBuffer;
2598
        src2= formatConvBuffer+VOFW;
2599
    }
2600
    else if (srcFormat==PIX_FMT_RGB565)
2601
    {
2602
        if(c->chrSrcHSubSample)
2603
            RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2604
        else
2605
            RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2606
        src1= formatConvBuffer;
2607
        src2= formatConvBuffer+VOFW;
2608
    }
2609
    else if (srcFormat==PIX_FMT_RGB555)
2610
    {
2611
        if(c->chrSrcHSubSample)
2612
            RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2613
        else
2614
            RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2615
        src1= formatConvBuffer;
2616
        src2= formatConvBuffer+VOFW;
2617
    }
2618
    else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2619
    {
2620
        return;
2621
    }
2622
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2623
    {
2624
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2625
        src1= formatConvBuffer;
2626
        src2= formatConvBuffer+VOFW;
2627
    }
2628

    
2629
#if HAVE_MMX
2630
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2631
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2632
#else
2633
    if (!(flags&SWS_FAST_BILINEAR))
2634
#endif
2635
    {
2636
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2637
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2638
    }
2639
    else // fast bilinear upscale / crap downscale
2640
    {
2641
#if ARCH_X86 && CONFIG_GPL
2642
#if HAVE_MMX2
2643
        int i;
2644
#if defined(PIC)
2645
        uint64_t ebxsave __attribute__((aligned(8)));
2646
#endif
2647
        if (canMMX2BeUsed)
2648
        {
2649
            __asm__ volatile(
2650
#if defined(PIC)
2651
            "mov          %%"REG_b", %6         \n\t"
2652
#endif
2653
            "pxor             %%mm7, %%mm7      \n\t"
2654
            "mov                 %0, %%"REG_c"  \n\t"
2655
            "mov                 %1, %%"REG_D"  \n\t"
2656
            "mov                 %2, %%"REG_d"  \n\t"
2657
            "mov                 %3, %%"REG_b"  \n\t"
2658
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2659
            PREFETCH"   (%%"REG_c")             \n\t"
2660
            PREFETCH" 32(%%"REG_c")             \n\t"
2661
            PREFETCH" 64(%%"REG_c")             \n\t"
2662

    
2663
#if ARCH_X86_64
2664

    
2665
#define FUNNY_UV_CODE \
2666
            "movl       (%%"REG_b"), %%esi      \n\t"\
2667
            "call               *%4             \n\t"\
2668
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2669
            "add          %%"REG_S", %%"REG_c"  \n\t"\
2670
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2671
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2672

    
2673
#else
2674

    
2675
#define FUNNY_UV_CODE \
2676
            "movl       (%%"REG_b"), %%esi      \n\t"\
2677
            "call               *%4             \n\t"\
2678
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2679
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2680
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2681

    
2682
#endif /* ARCH_X86_64 */
2683

    
2684
FUNNY_UV_CODE
2685
FUNNY_UV_CODE
2686
FUNNY_UV_CODE
2687
FUNNY_UV_CODE
2688
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2689
            "mov                 %5, %%"REG_c"  \n\t" // src
2690
            "mov                 %1, %%"REG_D"  \n\t" // buf1
2691
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
2692
            PREFETCH"   (%%"REG_c")             \n\t"
2693
            PREFETCH" 32(%%"REG_c")             \n\t"
2694
            PREFETCH" 64(%%"REG_c")             \n\t"
2695

    
2696
FUNNY_UV_CODE
2697
FUNNY_UV_CODE
2698
FUNNY_UV_CODE
2699
FUNNY_UV_CODE
2700

    
2701
#if defined(PIC)
2702
            "mov %6, %%"REG_b"    \n\t"
2703
#endif
2704
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2705
            "m" (funnyUVCode), "m" (src2)
2706
#if defined(PIC)
2707
            ,"m" (ebxsave)
2708
#endif
2709
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2710
#if !defined(PIC)
2711
             ,"%"REG_b
2712
#endif
2713
            );
2714
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2715
            {
2716
                //printf("%d %d %d\n", dstWidth, i, srcW);
2717
                dst[i] = src1[srcW-1]*128;
2718
                dst[i+VOFW] = src2[srcW-1]*128;
2719
            }
2720
        }
2721
        else
2722
        {
2723
#endif /* HAVE_MMX2 */
2724
            x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2725
            uint16_t xInc_mask = xInc & 0xffff;
2726
            __asm__ volatile(
2727
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
2728
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2729
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
2730
            ASMALIGN(4)
2731
            "1:                                     \n\t"
2732
            "mov        %0, %%"REG_S"               \n\t"
2733
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2734
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2735
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2736
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2737
            "shll      $16, %%edi                   \n\t"
2738
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2739
            "mov        %1, %%"REG_D"               \n\t"
2740
            "shrl       $9, %%esi                   \n\t"
2741
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2742

    
2743
            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2744
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2745
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2746
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2747
            "shll      $16, %%edi                   \n\t"
2748
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2749
            "mov        %1, %%"REG_D"               \n\t"
2750
            "shrl       $9, %%esi                   \n\t"
2751
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
2752

    
2753
            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
2754
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
2755
            "add        $1, %%"REG_a"               \n\t"
2756
            "cmp        %2, %%"REG_a"               \n\t"
2757
            " jb        1b                          \n\t"
2758

    
2759
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2760
   which is needed to support GCC 4.0. */
2761
#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2762
            :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2763
#else
2764
            :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2765
#endif
2766
            "r" (src2)
2767
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2768
            );
2769
#if HAVE_MMX2
2770
        } //if MMX2 can't be used
2771
#endif
2772
#else
2773
        int i;
2774
        unsigned int xpos=0;
2775
        for (i=0;i<dstWidth;i++)
2776
        {
2777
            register unsigned int xx=xpos>>16;
2778
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
2779
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2780
            dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2781
            /* slower
2782
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2783
            dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2784
            */
2785
            xpos+=xInc;
2786
        }
2787
#endif /* ARCH_X86 */
2788
    }
2789
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2790
        int i;
2791
        //FIXME all pal and rgb srcFormats could do this convertion as well
2792
        //FIXME all scalers more complex than bilinear could do half of this transform
2793
        if(c->srcRange){
2794
            for (i=0; i<dstWidth; i++){
2795
                dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
2796
                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2797
            }
2798
        }else{
2799
            for (i=0; i<dstWidth; i++){
2800
                dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2801
                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2802
            }
2803
        }
2804
    }
2805
}
2806

    
2807
/**
 * Main per-slice scaling entry point (template-instantiated per CPU flavor
 * via RENAME()).  Horizontally scales the incoming slice into the luma/chroma
 * ring buffers, then vertically scales/converts as many destination lines as
 * the buffered input allows.
 *
 * @param c          scaler context; dstY/lumBufIndex/chrBufIndex/lastIn*Buf
 *                   are read at entry and written back before returning
 * @param src        source plane pointers (replicated for packed formats below)
 * @param srcStride  source plane strides; note strides [1] and [2] are
 *                   modified in place (vChrDrop shift), as is src[] for
 *                   packed input — callers must not rely on them afterwards
 * @param srcSliceY  first source line of this slice
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers
 * @param dstStride  destination plane strides
 * @return           number of destination lines output for this slice
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    const int canMMX2BeUsed= c->canMMX2BeUsed;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *funnyYCode= c->funnyYCode;
    uint8_t *funnyUVCode= c->funnyUVCode;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* For packed input all "planes" are the same buffer with the same stride,
       so the chroma/alpha code paths below can index uniformly. */
    if (isPacked(c->srcFormat)){
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
    //       (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
    {
    static volatile int i=0;
    i++;
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
    i--;
    }
#endif

    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
    //dstStride[0],dstStride[1],dstStride[2]);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0)
    {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready)
        {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
                lumBufIndex++;
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 0);
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                    RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                    flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                    funnyYCode, c->srcFormat, formatConvBuffer,
                                    c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 1);
                lastInLumBuf++;
            }
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
                //FIXME replace parameters through context struct (some at least)

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest in the buffer
        {
            //Do horizontal scaling
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
                lumBufIndex++;
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 0);
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                    RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                    flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                    funnyYCode, c->srcFormat, formatConvBuffer,
                                    c->lumMmx2Filter, c->lumMmx2FilterPos, pal, 1);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                            flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                            funnyUVCode, c->srcFormat, formatConvBuffer,
                            c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }

#if HAVE_MMX
        /* per-line dither tables; 555 needs the coarser table for green too */
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2)
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if HAVE_MMX
            int i;
        if (flags & SWS_ACCURATE_RND){
            /* pack (ptr, ptr2, coef) tuples for the accurate-rounding MMX code */
            int s= APCK_SIZE / 8;
            for (i=0; i<vLumFilterSize; i+=2){
                *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                          lumMmxFilter[s*i+APCK_COEF/4  ]=
                          lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
                    *(void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                    *(void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                              alpMmxFilter[s*i+APCK_COEF/4  ]=
                              alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                }
            }
            for (i=0; i<vChrFilterSize; i+=2){
                *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                          chrMmxFilter[s*i+APCK_COEF/4  ]=
                          chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
            }
        }else{
            /* split each src pointer into low/high 32-bit halves and
               duplicate each 16-bit coefficient into a 32-bit pair */
            for (i=0; i<vLumFilterSize; i++)
            {
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
                    alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                    alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                    alpMmxFilter[4*i+2]=
                    alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                }
            }
            for (i=0; i<vChrFilterSize; i++)
            {
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
        }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                RENAME(yuv2nv12X)(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf[0] : NULL;
                    RENAME(yuv2yuv1)(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                            alpPixBuf ? *alpSrcPtr : NULL,
                            dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                            alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                            dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                }
                else //general RGB
                {
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packedX)(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT){
                    yuv2rgbXinC_full(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, dstW, dstY);
                }else{
                    yuv2packedXinC(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* YUVA output without alpha input: fill the alpha plane with opaque */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if HAVE_MMX
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}