/*
 * ffmpeg / libswscale / swscale_template.c @ revision f433c8ab
 * (code-browser page header removed)
 */
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
/* Reset all CPU-capability-dependent macro names so the conditional
 * definitions below can rebind them according to the feature flags
 * (HAVE_MMX2 / HAVE_3DNOW) active for this inclusion of the template. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

/* Instruction used to exit MMX state before FPU code runs. */
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Memory prefetch hints: 3DNow! provides prefetch/prefetchw, MMX2 (SSE)
 * provides prefetchnta/prefetcht0; otherwise expand to an assembler
 * comment so the surrounding asm strings stay valid. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined (HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

/* Store fence, used to order the non-temporal stores issued by MOVNTQ;
 * expands to an assembler comment when MMX2 is not available. */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

/* Packed unsigned byte average: pavgb (MMX2) or pavgusb (3DNow!).
 * Note: intentionally left undefined when neither feature is available. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* Quadword store: non-temporal (cache-bypassing) movntq with MMX2,
 * plain movq otherwise.  The REAL_/wrapper pair ensures macro arguments
 * are fully expanded before stringification. */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

/* PowerPC AltiVec implementations are textually included when available. */
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

/*
 * Vertical scaling + output of one plane (planar path, MMX, pmulhw).
 *
 * x      - string with a byte offset added to every source pointer
 * offset - string with the offset of the filter list inside the context (%0)
 * dest   - output pointer (operand %1)
 * width  - number of output pixels (operand %2)
 *
 * The filter list at offset(%0) is a sequence of 16-byte entries
 * {src pointer, filterCoeff}; a NULL src pointer terminates the inner
 * loop (mov/test/jnz).  Products pmulhw(src, coeff) are accumulated into
 * mm3/mm4, which start from the rounding constant at VROUNDER_OFFSET;
 * the sums are shifted right by 3, packed to unsigned bytes and stored
 * 8 pixels at a time with MOVNTQ.  Clobbers REG_a/REG_d/REG_S and mm regs.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    asm volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t"\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/*
 * Higher-precision variant of YSCALEYUV2YV12X: processes two filter taps
 * per inner iteration (second tap's pointer/coeff at APCK_PTR2/APCK_COEF,
 * entry stride APCK_SIZE) and accumulates with pmaddwd into 32-bit lanes
 * (mm4..mm7) instead of 16-bit pmulhw sums, avoiding intermediate
 * overflow/rounding loss.  The dword sums are shifted right 16, packed
 * back to words, biased by the VROUNDER_OFFSET constant, shifted by 3,
 * packed to unsigned bytes and stored 8 pixels per pass with MOVNTQ.
 * Same parameters and clobbers as YSCALEYUV2YV12X.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    asm volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t"\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/*
 * Unfiltered 1:1 vertical output: 16-bit source words (%0) >> 7, packed
 * to unsigned bytes and stored to %1, 8 pixels per iteration.
 * REG_a is loaded from %2 and incremented until the add carries (jnc),
 * so the caller presumably passes a negative count with %0/%1 pointing
 * past the buffer ends — NOTE(review): confirm against the call sites.
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

/*
 * Like YSCALEYUV2YV121 but rounds instead of truncating: mm7 is built as
 * 0x0040 in every word lane (all-ones >> 15 << 6, i.e. +64 = half of the
 * 1<<7 shift) and added with saturation before the >> 7.
 */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/*
 * Vertical scaling for packed output (pmulhw path): first sums the
 * chroma filter taps (U into mm3, V into mm4 — V data lives at offset
 * VOF from the U data), then the luma taps (Y1 into mm1, Y2 into mm7),
 * 8 pixels per outer iteration.  Each tap list is a sequence of
 * {src pointer, coeff} entries terminated by a NULL pointer.
 * The asm statement is deliberately left open: a pixel-writer macro is
 * appended, and YSCALEYUV2PACKEDX_END supplies operands and clobbers.
 * The local label "2:" is reused; backward jumps bind to the nearest one.
 */
#define YSCALEYUV2PACKEDX \
    asm volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
    "movq                      %%mm1, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm1         \n\t"\
    "paddw                     %%mm5, %%mm7         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\

/*
 * Operand/clobber tail closing the asm statement opened by
 * YSCALEYUV2PACKEDX / YSCALEYUV2PACKEDX_ACCURATE.  The three "m"(dummy)
 * operands occupy %1-%3 so that dest/dstW become %4/%5 — keep in sync
 * with the writer macros that reference those operand numbers.
 */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );

/*
 * Higher-precision variant of YSCALEYUV2PACKEDX: two taps per inner
 * iteration (APCK_PTR2/APCK_COEF/APCK_SIZE layout) with pmaddwd 32-bit
 * accumulation.  The chroma sums are rounded (VROUNDER_OFFSET) and parked
 * in the context's U_TEMP/V_TEMP scratch slots while the luma loop
 * reuses the registers; at the end mm1/mm7 hold Y1/Y2 and mm3/mm4 reload
 * U/V, matching the register contract expected by the RGB writer macros.
 * Like YSCALEYUV2PACKEDX, the asm statement is left open for a writer
 * plus YSCALEYUV2PACKEDX_END; the "2:" label is deliberately reused.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\

/*
 * YUV -> RGB conversion stage for the packed-output paths.
 * Input register contract: mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V
 * (as left by YSCALEYUV2PACKEDX(_ACCURATE)); %0 points at the context
 * holding the offset/coefficient constants.
 * Output: mm2 = B, mm4 = G, mm5 = R, each packed to 8 unsigned bytes;
 * mm7 is zeroed for use by the following pixel-writer code.
 */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\
    "pxor            %%mm7, %%mm7       \n\t"

#if 0   /* Disabled legacy variant, compiled out; kept verbatim for reference. */
#define FULL_YSCALEYUV2RGB \
    "pxor                 %%mm7, %%mm7  \n\t"\
    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "xor              %%"REG_a", %%"REG_a"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
\
\
    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
    "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
\
\
    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
    "packuswb             %%mm3, %%mm3  \n\t"\
\
    "packuswb             %%mm0, %%mm0  \n\t"\
    "paddw                %%mm4, %%mm2  \n\t"\
    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
\
    "packuswb             %%mm1, %%mm1  \n\t"
#endif

/*
 * Two-line bilinear interpolation for packed YUV output (no RGB matrix).
 * Interpolates between luma lines %0/%1 and chroma lines %2/%3 using the
 * alpha values stored in the second slot (+8) of the chroma/luma filter
 * areas of context "c".  Those stored alphas are pre-shifted right by 3
 * and written back in place before the loop — i.e. this macro mutates
 * the context's filter slots.  Leaves Y in mm1/mm7 and U/V in mm3/mm4
 * (>>7 scale) for a following packed writer.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)

/*
 * Two-line bilinear interpolation + YUV->RGB conversion (MMX).
 * Interpolates luma between %0/%1 and chroma between %2/%3 using the
 * alphas stored at CHR/LUM_MMX_FILTER_OFFSET+8 of context "c", then
 * applies the same color matrix as YSCALEYUV2RGBX (offsets relative to
 * "c" instead of %0).  Output contract matches YSCALEYUV2RGBX:
 * mm2 = B, mm4 = G, mm5 = R packed bytes, mm7 zeroed for the writer.
 */
#define REAL_YSCALEYUV2RGB(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)

/*
 * Single-line (no vertical interpolation) variant of YSCALEYUV2PACKED:
 * reads one luma line (%0) and one chroma line (%2, V at offset VOF)
 * and just shifts the 16-bit values down by 7, leaving Y in mm1/mm7
 * and U/V in mm3/mm4 for a following packed writer.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)

562
/* Single-source-line YV12 -> RGB conversion kernel: one chroma line, one luma
 * line (>>4 to keep headroom), applies the offset/coefficient table at "c"
 * and leaves packed B in mm2, G in mm4, R in mm5 (mm7 cleared for unpacking). */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
610

    
611
/* Like YSCALEYUV2PACKED1, but averages two chroma lines (uvbuf0 + uvbuf1,
 * then >>8 == sum/2 scaled back to 8 bits) for vertical chroma interpolation. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
628

    
629
// do vertical chrominance interpolation
/* Like YSCALEYUV2RGB1 but averages two chroma lines (sum >> 5) before the
 * colorspace math; produces packed B/G/R in mm2/mm4/mm5 and clears mm7. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
682

    
683
/* Interleave the packed B/G/R bytes (mm2/mm4/mm5, mm7=0) into 8 pixels of
 * 32-bit 0RGB and store them; advances index by 8 and loops back to label 1. */
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0,   (dst, index, 4))\
    MOVNTQ(%%mm2,  8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
707

    
708
/* Pack B/G/R bytes (mm2/mm4/mm5) into RGB565: keep 5/6/5 high bits via the
 * bF8/bFC masks, shift into place, and store 8 pixels; loops via label 1. */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
735

    
736
/* Pack B/G/R bytes (mm2/mm4/mm5) into RGB555: 5 bits per channel (bF8 mask),
 * R shifted right one extra bit, and store 8 pixels; loops via label 1. */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
764

    
765
/* Legacy 24-bit writer: builds four 0RGB dwords, then shifts/masks them into
 * three packed 24-byte qwords (RGBRGB...) and stores them; loops via label 1.
 * Superseded by WRITEBGR24MMX/WRITEBGR24MMX2 below. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
820

    
821
/* Plain-MMX 24-bit writer: expands B/G/R (mm2/mm4/mm5, mm7=0) to 0RGB dwords,
 * compacts them with shift/punpckhdq/por into three 8-byte stores (24 bytes =
 * 8 pixels), advances dst by 24 and index by 8; loops via label 1. */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
873

    
874
/* MMX2 24-bit writer: uses pshufw to replicate channel bytes and the
 * ff_M24A/B/C masks to slot them directly into three packed 8-byte stores;
 * fewer shuffles than WRITEBGR24MMX. Loops via label 1. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
921

    
922
/* Pick the fastest available 24-bit writer: the pshufw-based MMX2 variant
 * when HAVE_MMX2 is defined, the plain MMX variant otherwise. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
929

    
930
/* Pack Y (mm1/mm7), U (mm3) and V (mm4) into interleaved YUYV and store
 * 16 bytes (8 pixels); advances index by 8 and loops back to label 1. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
946

    
947

    
948
/*
 * Vertically filter luma/chroma source lines into planar YV12 output.
 * Uses the MMX kernel macros unless SWS_BITEXACT is set; SWS_ACCURATE_RND
 * selects the accurate-rounding variants. Chroma is only written when uDest
 * is non-NULL. Non-MMX builds fall back to AltiVec or the generic C version.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
    if (c->flags & SWS_ACCURATE_RND){
        if (uDest){
            YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }else{
        if (uDest){
            YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }
        return;
    }
#endif
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}
982

    
983
/*
 * Vertically filter luma/chroma source lines into a semi-planar (NV12-style)
 * destination. No SIMD fast path exists for this layout, so the call is
 * forwarded unchanged to the generic C implementation.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
991

    
992
/*
 * Unscaled vertical pass: convert the 15-bit intermediate lines straight to
 * 8-bit planar output (one luma line, optionally one U and one V line).
 * The MMX path (skipped for SWS_BITEXACT) processes up to three planes with
 * the YSCALEYUV2YV121 kernels; the C fallback rounds with (+64)>>7 and clamps.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
    int i;
#ifdef HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
    long p= uDest ? 3 : 1;  /* luma only, or luma + U + V */
    /* NOTE(review): int16_t* sources stored as uint8_t* — the asm indexes in
     * bytes from the end of each line; confirm against the kernel macros. */
    uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
    uint8_t *dst[3]= {dest, uDest, vDest};
    long counter[3] = {dstW, chrDstW, chrDstW};

    if (c->flags & SWS_ACCURATE_RND){
        while(p--){
            asm volatile(
                YSCALEYUV2YV121_ACCURATE
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                "g" (-counter[p])
                : "%"REG_a
            );
        }
    }else{
        while(p--){
            asm volatile(
                YSCALEYUV2YV121
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                "g" (-counter[p])
                : "%"REG_a
            );
        }
    }
        return;
    }
#endif
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;  /* round 15-bit intermediate to 8 bits */

        /* cheap clamp: bit 8 set means out of [0,255] after the shift */
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;  /* V plane lives VOFW words after U */

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
}
1054

    
1055

    
1056
/**
 * vertical scale YV12 to RGB
 * Dispatches per destination pixel format to an MMX fast path (skipped when
 * SWS_BITEXACT is set; SWS_ACCURATE_RND selects the accurate-rounding
 * kernels), then to AltiVec for the formats it supports, and finally to the
 * generic C implementation.
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
{
#ifdef HAVE_MMX
    long dummy=0;
    if(!(c->flags & SWS_BITEXACT)){
    if (c->flags & SWS_ACCURATE_RND){
        switch(c->dstFormat){
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)

            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
            "add %4, %%"REG_c"                        \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)


            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest), "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_RGB555:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITERGB15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_RGB565:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITERGB16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX_ACCURATE
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3    \n\t"
            "psraw $3, %%mm4    \n\t"
            "psraw $3, %%mm1    \n\t"
            "psraw $3, %%mm7    \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
    }
    }else{
        switch(c->dstFormat)
        {
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
            "add                        %4, %%"REG_c"   \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)

            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest),  "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_RGB555:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4  \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
#endif

            WRITERGB15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_RGB565:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4  \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
#endif

            WRITERGB16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3    \n\t"
            "psraw $3, %%mm4    \n\t"
            "psraw $3, %%mm1    \n\t"
            "psraw $3, %%mm7    \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        }
    }
    }
#endif /* HAVE_MMX */
#ifdef HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of altivec_yuv2packedX() */
    if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                                 chrFilter, chrSrc, chrFilterSize,
                                 dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       dest, dstW, dstY);
}
1205

    
1206
/**
1207
 * vertical bilinear scale YV12 to RGB
1208
 */
1209
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1210
                          uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1211
{
1212
    int  yalpha1=4095- yalpha;
1213
    int uvalpha1=4095-uvalpha;
1214
    int i;
1215

    
1216
#if 0 //isn't used
1217
    if (flags&SWS_FULL_CHR_H_INT)
1218
    {
1219
        switch(dstFormat)
1220
        {
1221
#ifdef HAVE_MMX
1222
        case PIX_FMT_RGB32:
1223
            asm volatile(
1224

1225

1226
FULL_YSCALEYUV2RGB
1227
            "punpcklbw %%mm1, %%mm3    \n\t" // BGBGBGBG
1228
            "punpcklbw %%mm7, %%mm0    \n\t" // R0R0R0R0
1229

1230
            "movq      %%mm3, %%mm1    \n\t"
1231
            "punpcklwd %%mm0, %%mm3    \n\t" // BGR0BGR0
1232
            "punpckhwd %%mm0, %%mm1    \n\t" // BGR0BGR0
1233

1234
            MOVNTQ(%%mm3,  (%4, %%REGa, 4))
1235
            MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1236

1237
            "add $4, %%"REG_a"  \n\t"
1238
            "cmp %5, %%"REG_a"  \n\t"
1239
            " jb 1b             \n\t"
1240

1241
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1242
            "m" (yalpha1), "m" (uvalpha1)
1243
            : "%"REG_a
1244
            );
1245
            break;
1246
        case PIX_FMT_BGR24:
1247
            asm volatile(
1248

1249
FULL_YSCALEYUV2RGB
1250

1251
                                              // lsb ... msb
1252
            "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
1253
            "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0
1254

1255
            "movq      %%mm3, %%mm1     \n\t"
1256
            "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
1257
            "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0
1258

1259
            "movq      %%mm3, %%mm2     \n\t" // BGR0BGR0
1260
            "psrlq        $8, %%mm3     \n\t" // GR0BGR00
1261
            "pand "MANGLE(bm00000111)", %%mm2   \n\t" // BGR00000
1262
            "pand "MANGLE(bm11111000)", %%mm3   \n\t" // 000BGR00
1263
            "por       %%mm2, %%mm3     \n\t" // BGRBGR00
1264
            "movq      %%mm1, %%mm2     \n\t"
1265
            "psllq       $48, %%mm1     \n\t" // 000000BG
1266
            "por       %%mm1, %%mm3     \n\t" // BGRBGRBG
1267

1268
            "movq      %%mm2, %%mm1     \n\t" // BGR0BGR0
1269
            "psrld       $16, %%mm2     \n\t" // R000R000
1270
            "psrlq       $24, %%mm1     \n\t" // 0BGR0000
1271
            "por       %%mm2, %%mm1     \n\t" // RBGRR000
1272

1273
            "mov          %4, %%"REG_b" \n\t"
1274
            "add   %%"REG_a", %%"REG_b" \n\t"
1275

1276
#ifdef HAVE_MMX2
1277
            //FIXME Alignment
1278
            "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2)   \n\t"
1279
            "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)   \n\t"
1280
#else
1281
            "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)     \n\t"
1282
            "psrlq  $32, %%mm3                          \n\t"
1283
            "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)     \n\t"
1284
            "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)     \n\t"
1285
#endif
1286
            "add     $4, %%"REG_a"                      \n\t"
1287
            "cmp     %5, %%"REG_a"                      \n\t"
1288
            " jb     1b                                 \n\t"
1289

    
1290
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1291
            "m" (yalpha1), "m" (uvalpha1)
1292
            : "%"REG_a, "%"REG_b
1293
            );
1294
            break;
1295
        case PIX_FMT_BGR555:
1296
            asm volatile(
1297

    
1298
FULL_YSCALEYUV2RGB
1299
#ifdef DITHER1XBPP
1300
            "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1301
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1302
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1303
#endif
1304
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1305
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1306
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1307

    
1308
            "psrlw                   $3, %%mm3  \n\t"
1309
            "psllw                   $2, %%mm1  \n\t"
1310
            "psllw                   $7, %%mm0  \n\t"
1311
            "pand     "MANGLE(g15Mask)", %%mm1  \n\t"
1312
            "pand     "MANGLE(r15Mask)", %%mm0  \n\t"
1313

    
1314
            "por                  %%mm3, %%mm1  \n\t"
1315
            "por                  %%mm1, %%mm0  \n\t"
1316

    
1317
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1318

    
1319
            "add $4, %%"REG_a"  \n\t"
1320
            "cmp %5, %%"REG_a"  \n\t"
1321
            " jb 1b             \n\t"
1322

    
1323
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1324
            "m" (yalpha1), "m" (uvalpha1)
1325
            : "%"REG_a
1326
            );
1327
            break;
1328
        case PIX_FMT_BGR565:
1329
            asm volatile(
1330

    
1331
FULL_YSCALEYUV2RGB
1332
#ifdef DITHER1XBPP
1333
            "paddusb "MANGLE(g6Dither)", %%mm1  \n\t"
1334
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1335
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1336
#endif
1337
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1338
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1339
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1340

    
1341
            "psrlw                   $3, %%mm3  \n\t"
1342
            "psllw                   $3, %%mm1  \n\t"
1343
            "psllw                   $8, %%mm0  \n\t"
1344
            "pand     "MANGLE(g16Mask)", %%mm1  \n\t"
1345
            "pand     "MANGLE(r16Mask)", %%mm0  \n\t"
1346

    
1347
            "por                  %%mm3, %%mm1  \n\t"
1348
            "por                  %%mm1, %%mm0  \n\t"
1349

    
1350
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1351

    
1352
            "add $4, %%"REG_a"  \n\t"
1353
            "cmp %5, %%"REG_a"  \n\t"
1354
            " jb 1b             \n\t"
1355

    
1356
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1357
            "m" (yalpha1), "m" (uvalpha1)
1358
            : "%"REG_a
1359
            );
1360
            break;
1361
#endif /* HAVE_MMX */
1362
        case PIX_FMT_BGR32:
1363
#ifndef HAVE_MMX
1364
        case PIX_FMT_RGB32:
1365
#endif
1366
            if (dstFormat==PIX_FMT_RGB32)
1367
            {
1368
                int i;
1369
#ifdef WORDS_BIGENDIAN
1370
                dest++;
1371
#endif
1372
                for (i=0;i<dstW;i++){
1373
                    // vertical linear interpolation && yuv2rgb in a single step:
1374
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1375
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1376
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1377
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1378
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1379
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1380
                    dest+= 4;
1381
                }
1382
            }
1383
            else if (dstFormat==PIX_FMT_BGR24)
1384
            {
1385
                int i;
1386
                for (i=0;i<dstW;i++){
1387
                    // vertical linear interpolation && yuv2rgb in a single step:
1388
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1389
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1390
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1391
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1392
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1393
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1394
                    dest+= 3;
1395
                }
1396
            }
1397
            else if (dstFormat==PIX_FMT_BGR565)
1398
            {
1399
                int i;
1400
                for (i=0;i<dstW;i++){
1401
                    // vertical linear interpolation && yuv2rgb in a single step:
1402
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1403
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1404
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1405

    
1406
                    ((uint16_t*)dest)[i] =
1407
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1408
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1409
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1410
                }
1411
            }
1412
            else if (dstFormat==PIX_FMT_BGR555)
1413
            {
1414
                int i;
1415
                for (i=0;i<dstW;i++){
1416
                    // vertical linear interpolation && yuv2rgb in a single step:
1417
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1418
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1419
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1420

    
1421
                    ((uint16_t*)dest)[i] =
1422
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1423
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1424
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1425
                }
1426
            }
1427
        }//FULL_UV_IPOL
1428
    else
1429
    {
1430
#endif // if 0
1431
#ifdef HAVE_MMX
1432
    if(!(c->flags & SWS_BITEXACT)){
1433
        switch(c->dstFormat)
1434
        {
1435
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1436
            case PIX_FMT_RGB32:
1437
                asm volatile(
1438
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1439
                "mov        %4, %%"REG_b"               \n\t"
1440
                "push %%"REG_BP"                        \n\t"
1441
                YSCALEYUV2RGB(%%REGBP, %5)
1442
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1443
                "pop %%"REG_BP"                         \n\t"
1444
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1445

    
1446
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1447
                "a" (&c->redDither)
1448
                );
1449
                return;
1450
            case PIX_FMT_BGR24:
1451
                asm volatile(
1452
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1453
                "mov        %4, %%"REG_b"               \n\t"
1454
                "push %%"REG_BP"                        \n\t"
1455
                YSCALEYUV2RGB(%%REGBP, %5)
1456
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1457
                "pop %%"REG_BP"                         \n\t"
1458
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1459
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1460
                "a" (&c->redDither)
1461
                );
1462
                return;
1463
            case PIX_FMT_RGB555:
1464
                asm volatile(
1465
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1466
                "mov        %4, %%"REG_b"               \n\t"
1467
                "push %%"REG_BP"                        \n\t"
1468
                YSCALEYUV2RGB(%%REGBP, %5)
1469
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1470
#ifdef DITHER1XBPP
1471
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1472
                "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1473
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1474
#endif
1475

    
1476
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1477
                "pop %%"REG_BP"                         \n\t"
1478
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1479

    
1480
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1481
                "a" (&c->redDither)
1482
                );
1483
                return;
1484
            case PIX_FMT_RGB565:
1485
                asm volatile(
1486
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1487
                "mov        %4, %%"REG_b"               \n\t"
1488
                "push %%"REG_BP"                        \n\t"
1489
                YSCALEYUV2RGB(%%REGBP, %5)
1490
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1491
#ifdef DITHER1XBPP
1492
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1493
                "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1494
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1495
#endif
1496

    
1497
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1498
                "pop %%"REG_BP"                         \n\t"
1499
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1500
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1501
                "a" (&c->redDither)
1502
                );
1503
                return;
1504
            case PIX_FMT_YUYV422:
1505
                asm volatile(
1506
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1507
                "mov %4, %%"REG_b"                        \n\t"
1508
                "push %%"REG_BP"                        \n\t"
1509
                YSCALEYUV2PACKED(%%REGBP, %5)
1510
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1511
                "pop %%"REG_BP"                         \n\t"
1512
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1513
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514
                "a" (&c->redDither)
1515
                );
1516
                return;
1517
            default: break;
1518
        }
1519
    }
1520
#endif //HAVE_MMX
1521
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1522
}
1523

    
1524
/**
1525
 * YV12 to RGB without scaling or interpolating
1526
 */
1527
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1528
                          uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1529
{
1530
    const int yalpha1=0;
1531
    int i;
1532

    
1533
    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1534
    const int yalpha= 4096; //FIXME ...
1535

    
1536
    if (flags&SWS_FULL_CHR_H_INT)
1537
    {
1538
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1539
        return;
1540
    }
1541

    
1542
#ifdef HAVE_MMX
1543
    if(!(flags & SWS_BITEXACT)){
1544
    if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1545
    {
1546
        switch(dstFormat)
1547
        {
1548
        case PIX_FMT_RGB32:
1549
            asm volatile(
1550
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1551
            "mov        %4, %%"REG_b"               \n\t"
1552
            "push %%"REG_BP"                        \n\t"
1553
            YSCALEYUV2RGB1(%%REGBP, %5)
1554
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1555
            "pop %%"REG_BP"                         \n\t"
1556
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1557

    
1558
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1559
            "a" (&c->redDither)
1560
            );
1561
            return;
1562
        case PIX_FMT_BGR24:
1563
            asm volatile(
1564
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1565
            "mov        %4, %%"REG_b"               \n\t"
1566
            "push %%"REG_BP"                        \n\t"
1567
            YSCALEYUV2RGB1(%%REGBP, %5)
1568
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1569
            "pop %%"REG_BP"                         \n\t"
1570
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1571

    
1572
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1573
            "a" (&c->redDither)
1574
            );
1575
            return;
1576
        case PIX_FMT_RGB555:
1577
            asm volatile(
1578
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1579
            "mov        %4, %%"REG_b"               \n\t"
1580
            "push %%"REG_BP"                        \n\t"
1581
            YSCALEYUV2RGB1(%%REGBP, %5)
1582
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1583
#ifdef DITHER1XBPP
1584
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1585
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1586
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1587
#endif
1588
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1589
            "pop %%"REG_BP"                         \n\t"
1590
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1591

    
1592
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1593
            "a" (&c->redDither)
1594
            );
1595
            return;
1596
        case PIX_FMT_RGB565:
1597
            asm volatile(
1598
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1599
            "mov        %4, %%"REG_b"               \n\t"
1600
            "push %%"REG_BP"                        \n\t"
1601
            YSCALEYUV2RGB1(%%REGBP, %5)
1602
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1603
#ifdef DITHER1XBPP
1604
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1605
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1606
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1607
#endif
1608

    
1609
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1610
            "pop %%"REG_BP"                         \n\t"
1611
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1612

    
1613
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1614
            "a" (&c->redDither)
1615
            );
1616
            return;
1617
        case PIX_FMT_YUYV422:
1618
            asm volatile(
1619
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1620
            "mov        %4, %%"REG_b"               \n\t"
1621
            "push %%"REG_BP"                        \n\t"
1622
            YSCALEYUV2PACKED1(%%REGBP, %5)
1623
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1624
            "pop %%"REG_BP"                         \n\t"
1625
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1626

    
1627
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1628
            "a" (&c->redDither)
1629
            );
1630
            return;
1631
        }
1632
    }
1633
    else
1634
    {
1635
        switch(dstFormat)
1636
        {
1637
        case PIX_FMT_RGB32:
1638
            asm volatile(
1639
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1640
            "mov        %4, %%"REG_b"               \n\t"
1641
            "push %%"REG_BP"                        \n\t"
1642
            YSCALEYUV2RGB1b(%%REGBP, %5)
1643
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1644
            "pop %%"REG_BP"                         \n\t"
1645
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1646

    
1647
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1648
            "a" (&c->redDither)
1649
            );
1650
            return;
1651
        case PIX_FMT_BGR24:
1652
            asm volatile(
1653
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1654
            "mov        %4, %%"REG_b"               \n\t"
1655
            "push %%"REG_BP"                        \n\t"
1656
            YSCALEYUV2RGB1b(%%REGBP, %5)
1657
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1658
            "pop %%"REG_BP"                         \n\t"
1659
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1660

    
1661
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1662
            "a" (&c->redDither)
1663
            );
1664
            return;
1665
        case PIX_FMT_RGB555:
1666
            asm volatile(
1667
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1668
            "mov        %4, %%"REG_b"               \n\t"
1669
            "push %%"REG_BP"                        \n\t"
1670
            YSCALEYUV2RGB1b(%%REGBP, %5)
1671
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1672
#ifdef DITHER1XBPP
1673
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1674
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1675
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1676
#endif
1677
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1678
            "pop %%"REG_BP"                         \n\t"
1679
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1680

    
1681
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1682
            "a" (&c->redDither)
1683
            );
1684
            return;
1685
        case PIX_FMT_RGB565:
1686
            asm volatile(
1687
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1688
            "mov        %4, %%"REG_b"               \n\t"
1689
            "push %%"REG_BP"                        \n\t"
1690
            YSCALEYUV2RGB1b(%%REGBP, %5)
1691
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1692
#ifdef DITHER1XBPP
1693
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1694
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1695
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1696
#endif
1697

    
1698
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1699
            "pop %%"REG_BP"                         \n\t"
1700
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1701

    
1702
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1703
            "a" (&c->redDither)
1704
            );
1705
            return;
1706
        case PIX_FMT_YUYV422:
1707
            asm volatile(
1708
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1709
            "mov        %4, %%"REG_b"               \n\t"
1710
            "push %%"REG_BP"                        \n\t"
1711
            YSCALEYUV2PACKED1b(%%REGBP, %5)
1712
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1713
            "pop %%"REG_BP"                         \n\t"
1714
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1715

    
1716
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1717
            "a" (&c->redDither)
1718
            );
1719
            return;
1720
        }
1721
    }
1722
    }
1723
#endif /* HAVE_MMX */
1724
    if (uvalpha < 2048)
1725
    {
1726
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1727
    }else{
1728
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1729
    }
1730
}
1731

    
1732
//FIXME yuy2* can read up to 7 samples too much
1733

    
1734
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1735
{
1736
#ifdef HAVE_MMX
1737
    asm volatile(
1738
    "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1739
    "mov                    %0, %%"REG_a"       \n\t"
1740
    "1:                                         \n\t"
1741
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1742
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1743
    "pand                %%mm2, %%mm0           \n\t"
1744
    "pand                %%mm2, %%mm1           \n\t"
1745
    "packuswb            %%mm1, %%mm0           \n\t"
1746
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
1747
    "add                    $8, %%"REG_a"       \n\t"
1748
    " js                    1b                  \n\t"
1749
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1750
    : "%"REG_a
1751
    );
1752
#else
1753
    int i;
1754
    for (i=0; i<width; i++)
1755
        dst[i]= src[2*i];
1756
#endif
1757
}
1758

    
1759
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1760
{
1761
#ifdef HAVE_MMX
1762
    asm volatile(
1763
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1764
    "mov                    %0, %%"REG_a"       \n\t"
1765
    "1:                                         \n\t"
1766
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1767
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1768
    "psrlw                  $8, %%mm0           \n\t"
1769
    "psrlw                  $8, %%mm1           \n\t"
1770
    "packuswb            %%mm1, %%mm0           \n\t"
1771
    "movq                %%mm0, %%mm1           \n\t"
1772
    "psrlw                  $8, %%mm0           \n\t"
1773
    "pand                %%mm4, %%mm1           \n\t"
1774
    "packuswb            %%mm0, %%mm0           \n\t"
1775
    "packuswb            %%mm1, %%mm1           \n\t"
1776
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
1777
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
1778
    "add                    $4, %%"REG_a"       \n\t"
1779
    " js                    1b                  \n\t"
1780
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1781
    : "%"REG_a
1782
    );
1783
#else
1784
    int i;
1785
    for (i=0; i<width; i++)
1786
    {
1787
        dstU[i]= src1[4*i + 1];
1788
        dstV[i]= src1[4*i + 3];
1789
    }
1790
#endif
1791
    assert(src1 == src2);
1792
}
1793

    
1794
/* This is almost identical to the previous, end exists only because
1795
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1796
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1797
{
1798
#ifdef HAVE_MMX
1799
    asm volatile(
1800
    "mov                  %0, %%"REG_a"         \n\t"
1801
    "1:                                         \n\t"
1802
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1803
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1804
    "psrlw                $8, %%mm0             \n\t"
1805
    "psrlw                $8, %%mm1             \n\t"
1806
    "packuswb          %%mm1, %%mm0             \n\t"
1807
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1808
    "add                  $8, %%"REG_a"         \n\t"
1809
    " js                  1b                    \n\t"
1810
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1811
    : "%"REG_a
1812
    );
1813
#else
1814
    int i;
1815
    for (i=0; i<width; i++)
1816
        dst[i]= src[2*i+1];
1817
#endif
1818
}
1819

    
1820
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1821
{
1822
#ifdef HAVE_MMX
1823
    asm volatile(
1824
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1825
    "mov                    %0, %%"REG_a"       \n\t"
1826
    "1:                                         \n\t"
1827
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1828
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1829
    "pand                %%mm4, %%mm0           \n\t"
1830
    "pand                %%mm4, %%mm1           \n\t"
1831
    "packuswb            %%mm1, %%mm0           \n\t"
1832
    "movq                %%mm0, %%mm1           \n\t"
1833
    "psrlw                  $8, %%mm0           \n\t"
1834
    "pand                %%mm4, %%mm1           \n\t"
1835
    "packuswb            %%mm0, %%mm0           \n\t"
1836
    "packuswb            %%mm1, %%mm1           \n\t"
1837
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
1838
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
1839
    "add                    $4, %%"REG_a"       \n\t"
1840
    " js                    1b                  \n\t"
1841
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1842
    : "%"REG_a
1843
    );
1844
#else
1845
    int i;
1846
    for (i=0; i<width; i++)
1847
    {
1848
        dstU[i]= src1[4*i + 0];
1849
        dstV[i]= src1[4*i + 2];
1850
    }
1851
#endif
1852
    assert(src1 == src2);
1853
}
1854

    
1855
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1856
static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width)\
1857
{\
1858
    int i;\
1859
    for (i=0; i<width; i++)\
1860
    {\
1861
        int b= (((type*)src)[i]>>shb)&maskb;\
1862
        int g= (((type*)src)[i]>>shg)&maskg;\
1863
        int r= (((type*)src)[i]>>shr)&maskr;\
1864
\
1865
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1866
    }\
1867
}
1868

    
1869
BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1870
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1871
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
1872
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
1873
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1874
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1875

    
1876
#define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1877
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
1878
{\
1879
    int i;\
1880
    for (i=0; i<width; i++)\
1881
    {\
1882
        int b= (((type*)src)[i]&maskb)>>shb;\
1883
        int g= (((type*)src)[i]&maskg)>>shg;\
1884
        int r= (((type*)src)[i]&maskr)>>shr;\
1885
\
1886
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1887
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1888
    }\
1889
}\
1890
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
1891
{\
1892
    int i;\
1893
    for (i=0; i<width; i++)\
1894
    {\
1895
        int pix0= ((type*)src)[2*i+0];\
1896
        int pix1= ((type*)src)[2*i+1];\
1897
        int g= (pix0&maskg)+(pix1&maskg);\
1898
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1899
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1900
\
1901
        g>>=shg;\
1902
\
1903
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1904
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1905
    }\
1906
}
1907

    
1908
BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1909
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1910
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
1911
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
1912
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1913
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1914

    
1915
#ifdef HAVE_MMX
1916
static inline void bgr24ToY_mmx(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1917
{
1918

    
1919
    if(srcFormat == PIX_FMT_BGR24){
1920
        asm volatile(
1921
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
1922
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
1923
            :
1924
        );
1925
    }else{
1926
        asm volatile(
1927
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
1928
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
1929
            :
1930
        );
1931
    }
1932

    
1933
    asm volatile(
1934
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
1935
        "mov                        %2, %%"REG_a"   \n\t"
1936
        "pxor                    %%mm7, %%mm7       \n\t"
1937
        "1:                                         \n\t"
1938
        PREFETCH"               64(%0)              \n\t"
1939
        "movd                     (%0), %%mm0       \n\t"
1940
        "movd                    2(%0), %%mm1       \n\t"
1941
        "movd                    6(%0), %%mm2       \n\t"
1942
        "movd                    8(%0), %%mm3       \n\t"
1943
        "add                       $12, %0          \n\t"
1944
        "punpcklbw               %%mm7, %%mm0       \n\t"
1945
        "punpcklbw               %%mm7, %%mm1       \n\t"
1946
        "punpcklbw               %%mm7, %%mm2       \n\t"
1947
        "punpcklbw               %%mm7, %%mm3       \n\t"
1948
        "pmaddwd                 %%mm5, %%mm0       \n\t"
1949
        "pmaddwd                 %%mm6, %%mm1       \n\t"
1950
        "pmaddwd                 %%mm5, %%mm2       \n\t"
1951
        "pmaddwd                 %%mm6, %%mm3       \n\t"
1952
        "paddd                   %%mm1, %%mm0       \n\t"
1953
        "paddd                   %%mm3, %%mm2       \n\t"
1954
        "paddd                   %%mm4, %%mm0       \n\t"
1955
        "paddd                   %%mm4, %%mm2       \n\t"
1956
        "psrad                     $15, %%mm0       \n\t"
1957
        "psrad                     $15, %%mm2       \n\t"
1958
        "packssdw                %%mm2, %%mm0       \n\t"
1959
        "packuswb                %%mm0, %%mm0       \n\t"
1960
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
1961
        "add                        $4, %%"REG_a"   \n\t"
1962
        " js                        1b              \n\t"
1963
    : "+r" (src)
1964
    : "r" (dst+width), "g" (-width)
1965
    : "%"REG_a
1966
    );
1967
}
1968

    
1969
/**
 * Convert one row of packed 24-bit RGB/BGR pixels to separate U and V planes
 * using MMX, processing 4 pixels per loop iteration.
 *
 * @param dstU      destination buffer for U (Cb) samples, width bytes
 * @param dstV      destination buffer for V (Cr) samples, width bytes
 * @param src       packed 3-bytes-per-pixel source row
 * @param width     number of pixels to convert
 * @param srcFormat PIX_FMT_RGB24 or PIX_FMT_BGR24; selects the coefficient
 *                  set from the ff_bgr24toUV table (R/B swapped between them)
 *
 * The loop counter in REG_a starts at -width and counts up to 0, with the
 * destination operands pre-biased by +width, so "js 1b" terminates the loop.
 */
static inline void bgr24ToUV_mmx(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
{
    asm volatile(
        /* mm6 caches the 4th coefficient quadword (24+%4); mm7 is zero for unpacking */
        "movq                    24+%4, %%mm6       \n\t"
        "mov                        %3, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        /* first two pixels: widen bytes to words, multiply-accumulate with
           the U coefficients (%4, 8+%4) and V coefficients (16+%4, mm6) */
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                    %4, %%mm0       \n\t"
        "pmaddwd                  8+%4, %%mm1       \n\t"
        "pmaddwd                 16+%4, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        /* second two pixels (bytes 6..11), same scheme; advance src by 12 */
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                    %4, %%mm1       \n\t"
        "pmaddwd                  8+%4, %%mm3       \n\t"
        "pmaddwd                 16+%4, %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        /* add rounding/bias offset, shift down to 8-bit range, pack and store
           4 U bytes and 4 V bytes */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    : "%"REG_a
    );
}
2026
#endif
2027

    
2028
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
2029
{
2030
#ifdef HAVE_MMX
2031
    bgr24ToY_mmx(dst, src, width, PIX_FMT_BGR24);
2032
#else
2033
    int i;
2034
    for (i=0; i<width; i++)
2035
    {
2036
        int b= src[i*3+0];
2037
        int g= src[i*3+1];
2038
        int r= src[i*3+2];
2039

    
2040
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2041
    }
2042
#endif /* HAVE_MMX */
2043
}
2044

    
2045
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2046
{
2047
#ifdef HAVE_MMX
2048
    bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_BGR24);
2049
#else
2050
    int i;
2051
    for (i=0; i<width; i++)
2052
    {
2053
        int b= src1[3*i + 0];
2054
        int g= src1[3*i + 1];
2055
        int r= src1[3*i + 2];
2056

    
2057
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2058
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2059
    }
2060
#endif /* HAVE_MMX */
2061
    assert(src1 == src2);
2062
}
2063

    
2064
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2065
{
2066
    int i;
2067
    for (i=0; i<width; i++)
2068
    {
2069
        int b= src1[6*i + 0] + src1[6*i + 3];
2070
        int g= src1[6*i + 1] + src1[6*i + 4];
2071
        int r= src1[6*i + 2] + src1[6*i + 5];
2072

    
2073
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2074
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2075
    }
2076
    assert(src1 == src2);
2077
}
2078

    
2079
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width)
2080
{
2081
#ifdef HAVE_MMX
2082
    bgr24ToY_mmx(dst, src, width, PIX_FMT_RGB24);
2083
#else
2084
    int i;
2085
    for (i=0; i<width; i++)
2086
    {
2087
        int r= src[i*3+0];
2088
        int g= src[i*3+1];
2089
        int b= src[i*3+2];
2090

    
2091
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2092
    }
2093
#endif
2094
}
2095

    
2096
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2097
{
2098
    int i;
2099
    assert(src1==src2);
2100
#ifdef HAVE_MMX
2101
    bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_RGB24);
2102
#else
2103
    for (i=0; i<width; i++)
2104
    {
2105
        int r= src1[3*i + 0];
2106
        int g= src1[3*i + 1];
2107
        int b= src1[3*i + 2];
2108

    
2109
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2110
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2111
    }
2112
#endif
2113
}
2114

    
2115
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2116
{
2117
    int i;
2118
    assert(src1==src2);
2119
    for (i=0; i<width; i++)
2120
    {
2121
        int r= src1[6*i + 0] + src1[6*i + 3];
2122
        int g= src1[6*i + 1] + src1[6*i + 4];
2123
        int b= src1[6*i + 2] + src1[6*i + 5];
2124

    
2125
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2126
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2127
    }
2128
}
2129

    
2130

    
2131
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2132
{
2133
    int i;
2134
    for (i=0; i<width; i++)
2135
    {
2136
        int d= src[i];
2137

    
2138
        dst[i]= pal[d] & 0xFF;
2139
    }
2140
}
2141

    
2142
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2143
{
2144
    int i;
2145
    assert(src1 == src2);
2146
    for (i=0; i<width; i++)
2147
    {
2148
        int p= pal[src1[i]];
2149

    
2150
        dstU[i]= p>>8;
2151
        dstV[i]= p>>16;
2152
    }
2153
}
2154

    
2155
static inline void RENAME(mono2Y)(uint8_t *dst, uint8_t *src, long width, int format)
2156
{
2157
    int i, j;
2158
    for (i=0; i<width/8; i++){
2159
        int d= format == PIX_FMT_MONOBLACK ? src[i] : ~src[i];
2160
        for(j=0; j<8; j++)
2161
            dst[8*i+j]= ((d>>(7-j))&1)*255;
2162
    }
2163
}
2164

    
2165
// bilinear / bicubic scaling
2166
/**
 * Horizontally scale one line with an arbitrary FIR filter
 * (bilinear / bicubic coefficients are generated elsewhere).
 *
 * @param dst        output, one int16 per destination pixel (14-bit range:
 *                   accumulator is shifted right by 7 and, in the C path,
 *                   clipped to 2^15-1)
 * @param dstW       number of output pixels
 * @param src        8-bit input line
 * @param srcW       input width (unused here; kept for interface parity)
 * @param xInc       fixed-point increment (unused here)
 * @param filter     filterSize coefficients per output pixel
 * @param filterPos  per-output-pixel start offset into src
 * @param filterSize number of taps; MMX paths require a multiple of 4
 *
 * The MMX loops run a counter from -2*dstW up to 0 (pointers pre-biased),
 * so "jnc 1b" exits when the counter wraps past zero.
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#ifdef HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        long counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push            %%"REG_b"              \n\t"
#endif
        "pxor                %%mm7, %%mm7       \n\t"
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        /* two output pixels per iteration: load their src offsets,
           4 coefficients each, 4 src bytes each, multiply-accumulate */
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
        "movd      (%3, %%"REG_a"), %%mm0       \n\t"
        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
        "punpcklbw           %%mm7, %%mm0       \n\t"
        "punpcklbw           %%mm7, %%mm2       \n\t"
        "pmaddwd             %%mm1, %%mm0       \n\t"
        "pmaddwd             %%mm2, %%mm3       \n\t"
        /* horizontal add of the two dword pairs, >>7, pack, store 2 int16 */
        "movq                %%mm0, %%mm4       \n\t"
        "punpckldq           %%mm3, %%mm0       \n\t"
        "punpckhdq           %%mm3, %%mm4       \n\t"
        "paddd               %%mm4, %%mm0       \n\t"
        "psrad                  $7, %%mm0       \n\t"
        "packssdw            %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
        "add                    $4, %%"REG_BP"  \n\t"
        " jnc                   1b              \n\t"

        "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
        "pop             %%"REG_b"              \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        long counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push             %%"REG_b"             \n\t"
#endif
        "pxor                 %%mm7, %%mm7      \n\t"
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
        "mov              %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        /* same as the 4-tap loop, but each pixel accumulates 8 taps
           in two pmaddwd rounds */
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm0      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm0      \n\t"
        "pmaddwd              %%mm2, %%mm3      \n\t"

        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm4      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm4      \n\t"
        "pmaddwd              %%mm2, %%mm5      \n\t"
        "paddd                %%mm4, %%mm0      \n\t"
        "paddd                %%mm5, %%mm3      \n\t"
        "movq                 %%mm0, %%mm4      \n\t"
        "punpckldq            %%mm3, %%mm0      \n\t"
        "punpckhdq            %%mm3, %%mm4      \n\t"
        "paddd                %%mm4, %%mm0      \n\t"
        "psrad                   $7, %%mm0      \n\t"
        "packssdw             %%mm0, %%mm0      \n\t"
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
        "add                     $4, %%"REG_BP" \n\t"
        " jnc                    1b             \n\t"

        "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
        "pop              %%"REG_b"             \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else
    {
        /* generic tap count (multiple of 4): inner loop "2:" walks the taps,
           outer loop "1:" walks output-pixel pairs */
        uint8_t *offset = src+filterSize;
        long counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
        "pxor                  %%mm7, %%mm7     \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "mov                      %2, %%"REG_c" \n\t"
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
        "mov                      %5, %%"REG_c" \n\t"
        "pxor                  %%mm4, %%mm4     \n\t"
        "pxor                  %%mm5, %%mm5     \n\t"
        "2:                                     \n\t"
        "movq                   (%1), %%mm1     \n\t"
        "movq               (%1, %6), %%mm3     \n\t"
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
        "punpcklbw             %%mm7, %%mm0     \n\t"
        "punpcklbw             %%mm7, %%mm2     \n\t"
        "pmaddwd               %%mm1, %%mm0     \n\t"
        "pmaddwd               %%mm2, %%mm3     \n\t"
        "paddd                 %%mm3, %%mm5     \n\t"
        "paddd                 %%mm0, %%mm4     \n\t"
        "add                      $8, %1        \n\t"
        "add                      $4, %%"REG_c" \n\t"
        "cmp                      %4, %%"REG_c" \n\t"
        " jb                      2b            \n\t"
        "add                      %6, %1        \n\t"
        "movq                  %%mm4, %%mm0     \n\t"
        "punpckldq             %%mm5, %%mm4     \n\t"
        "punpckhdq             %%mm5, %%mm0     \n\t"
        "paddd                 %%mm0, %%mm4     \n\t"
        "psrad                    $7, %%mm4     \n\t"
        "packssdw              %%mm4, %%mm4     \n\t"
        "mov                      %3, %%"REG_a" \n\t"
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
        "add                      $4, %0        \n\t"
        " jnc                     1b            \n\t"

        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" (filterSize*2)
        /* NOTE(review): %eax/%edx are written via movzwl but only REG_a/REG_d
           appear in the clobber list; on x86-32 they coincide — verify for
           the 64-bit build */
        : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#ifdef HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* plain C reference implementation */
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* HAVE_ALTIVEC */
#endif /* HAVE_MMX */
}
2347
      // *** horizontal scale Y line to temp buffer
2348
      // *** horizontal scale Y line to temp buffer
/**
 * Convert one source line to 8-bit luma (if needed) and horizontally scale
 * it into the int16 temp buffer dst.
 *
 * Steps:
 *  1. If srcFormat is not already a plain luma layout, convert the line into
 *     formatConvBuffer and point src at it.
 *  2. Scale: either the generic FIR path (hScale) or, for SWS_FAST_BILINEAR,
 *     a fast bilinear path (MMX2 "funny code", plain x86 asm, or C).
 *  3. If source and destination ranges differ (and dst is not RGB/BGR),
 *     remap the luma range in place with fixed-point affine transforms.
 *
 * funnyYCode is runtime-generated scaling code that the MMX2 path calls
 * indirectly ("call *%4"); mmx2Filter/mmx2FilterPos are its tables.
 * pal is the palette used for the 8-bit/palettized source formats.
 */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint8_t *pal)
{
    /* --- step 1: input format -> 8-bit luma in formatConvBuffer --- */
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        /* same as RGB32 but alpha leads; ALT32_CORR skips it */
        RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOBLACK ||srcFormat==PIX_FMT_MONOWHITE)
    {
        RENAME(mono2Y)(formatConvBuffer, src, srcW, srcFormat);
        src= formatConvBuffer;
    }

    /* --- step 2: horizontal scaling --- */
#ifdef HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            /* dispatch into the runtime-generated scaler 8 times; under PIC,
               ebx is preserved manually in ebxsave since it is the GOT
               pointer and cannot be listed as a clobber */
            asm volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyYCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            /* pad the right edge: outputs that would read past srcW-1 get
               the last source pixel (<<7 to match the 14-bit scale) */
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif /* HAVE_MMX2 */
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        /* 16.16 fixed-point DDA, two output pixels per iteration; the addw/adc
           pair propagates the fractional-overflow carry into xx */
        asm volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        /* NOTE(review): %%edi is written above but does not appear in the
           clobber list — verify against a current compiler */
        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* portable C bilinear: 16.16 fixed-point position, 7-bit alpha */
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif /* defined(ARCH_X86) */
    }

    /* --- step 3: luma range conversion (full <-> limited), in place --- */
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this convertion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++)
                dst[i]= (dst[i]*14071 + 33561947)>>14;
        }else{
            for (i=0; i<dstWidth; i++)
                dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
        }
    }
}
2577

    
2578
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2579
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2580
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2581
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2582
                                   int32_t *mmx2FilterPos, uint8_t *pal)
2583
{
2584
    if (srcFormat==PIX_FMT_YUYV422)
2585
    {
2586
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2587
        src1= formatConvBuffer;
2588
        src2= formatConvBuffer+VOFW;
2589
    }
2590
    else if (srcFormat==PIX_FMT_UYVY422)
2591
    {
2592
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2593
        src1= formatConvBuffer;
2594
        src2= formatConvBuffer+VOFW;
2595
    }
2596
    else if (srcFormat==PIX_FMT_RGB32)
2597
    {
2598
        if(c->chrSrcHSubSample)
2599
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2600
        else
2601
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2602
        src1= formatConvBuffer;
2603
        src2= formatConvBuffer+VOFW;
2604
    }
2605
    else if (srcFormat==PIX_FMT_RGB32_1)
2606
    {
2607
        if(c->chrSrcHSubSample)
2608
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2609
        else
2610
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2611
        src1= formatConvBuffer;
2612
        src2= formatConvBuffer+VOFW;
2613
    }
2614
    else if (srcFormat==PIX_FMT_BGR24)
2615
    {
2616
        if(c->chrSrcHSubSample)
2617
            RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2618
        else
2619
            RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2620
        src1= formatConvBuffer;
2621
        src2= formatConvBuffer+VOFW;
2622
    }
2623
    else if (srcFormat==PIX_FMT_BGR565)
2624
    {
2625
        if(c->chrSrcHSubSample)
2626
            RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2627
        else
2628
            RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2629
        src1= formatConvBuffer;
2630
        src2= formatConvBuffer+VOFW;
2631
    }
2632
    else if (srcFormat==PIX_FMT_BGR555)
2633
    {
2634
        if(c->chrSrcHSubSample)
2635
            RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2636
        else
2637
            RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2638
        src1= formatConvBuffer;
2639
        src2= formatConvBuffer+VOFW;
2640
    }
2641
    else if (srcFormat==PIX_FMT_BGR32)
2642
    {
2643
        if(c->chrSrcHSubSample)
2644
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2645
        else
2646
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2647
        src1= formatConvBuffer;
2648
        src2= formatConvBuffer+VOFW;
2649
    }
2650
    else if (srcFormat==PIX_FMT_BGR32_1)
2651
    {
2652
        if(c->chrSrcHSubSample)
2653
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2654
        else
2655
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2656
        src1= formatConvBuffer;
2657
        src2= formatConvBuffer+VOFW;
2658
    }
2659
    else if (srcFormat==PIX_FMT_RGB24)
2660
    {
2661
        if(c->chrSrcHSubSample)
2662
            RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2663
        else
2664
            RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2665
        src1= formatConvBuffer;
2666
        src2= formatConvBuffer+VOFW;
2667
    }
2668
    else if (srcFormat==PIX_FMT_RGB565)
2669
    {
2670
        if(c->chrSrcHSubSample)
2671
            /* horizontally subsampled chroma: use the pixel-pair averaging variant */
            RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
        else
            RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
        /* scale from the converted planar chroma: U at formatConvBuffer, V at +VOFW */
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        /* RGB555 input: extract U and V into the conversion buffer first */
        if(c->chrSrcHSubSample)
            RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
        else
            RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }
    else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
    {
        /* gray / monochrome input carries no chroma -> nothing to scale */
        return;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        /* palettized / low-depth input: derive U and V via the palette (pal) */
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }

#ifdef HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        /* generic filter-based horizontal scaling; the U and V output rows
           live VOFW int16 elements apart in dst */
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
        int i;
#if defined(PIC)
        /* %ebx is the PIC register and may not be clobbered; save it here */
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            /* Run the run-time-generated MMX2 horizontal scaler (funnyUVCode,
               called indirectly via "call *%4") over the U plane (src1) and
               then, after reloading %REG_c/%REG_D from %5/%1+VOF, over the V
               plane (src2).  FUNNY_UV_CODE is expanded 4 times per plane. */
            asm volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

#ifdef ARCH_X86_64

#define FUNNY_UV_CODE \
            "movl       (%%"REG_b"), %%esi      \n\t"\
            "call               *%4             \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
            "add          %%"REG_S", %%"REG_c"  \n\t"\
            "add          %%"REG_a", %%"REG_D"  \n\t"\
            "xor          %%"REG_a", %%"REG_a"  \n\t"\

#else

#define FUNNY_UV_CODE \
            "movl       (%%"REG_b"), %%esi      \n\t"\
            "call               *%4             \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add          %%"REG_a", %%"REG_D"  \n\t"\
            "xor          %%"REG_a", %%"REG_a"  \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyUVCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
            );
            /* The generated code must not read past src[srcW-1]; fill the
               remaining destination pixels by replicating the last source
               pixel (<<7 scale, same as *128). */
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
            {
                //printf("%d %d %d\n", dstWidth, i, srcW);
                dst[i] = src1[srcW-1]*128;
                dst[i+VOFW] = src2[srcW-1]*128;
            }
        }
        else
        {
#endif /* HAVE_MMX2 */
            long xInc_shr16 = (long) (xInc >> 16);
            uint16_t xInc_mask = xInc & 0xffff;
            /* Plain x86 bilinear loop: the 16.16 fixed-point position is kept
               as an integer part (xx in %REG_d) plus a fractional accumulator
               (%cx); "addw/adc" advances xx by xInc>>16 plus the carry out of
               the fraction.  Both planes are interpolated per iteration. */
            asm volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll      $16, %%edi                   \n\t"
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov        %1, %%"REG_D"               \n\t"
            "shrl       $9, %%esi                   \n\t"
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll      $16, %%edi                   \n\t"
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov        %1, %%"REG_D"               \n\t"
            "shrl       $9, %%esi                   \n\t"
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC 4.0. */
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
            );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* portable C fallback: 16.16 fixed-point bilinear interpolation,
           output scaled to the 15-bit intermediate format (7 extra bits) */
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
            dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
            /* slower
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
            dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
            */
            xpos+=xInc;
        }
#endif /* defined(ARCH_X86) */
    }
    /* NOTE(review): fixed-point remap between source and destination sample
       ranges for YUV outputs; the constants presumably implement the
       full<->limited range chroma transform — verify against the luma path. */
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++){
                dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
            }
        }else{
            for (i=0; i<dstWidth; i++){
                dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
            }
        }
    }
}
2874

    
2875
/**
 * Scale one horizontal slice of the source picture and emit the destination
 * lines that can be completed from it.
 *
 * Input lines are horizontally scaled (hyscale/hcscale) into the
 * lumPixBuf/chrPixBuf ring buffers; each destination line is then produced
 * by vertically filtering the buffered lines (yuv2yuv*/yuv2packed*/
 * yuv2nv12*/yuv2rgb* variants depending on dstFormat and filter sizes).
 * State that spans slices (dstY, buffer indices, last buffered input lines)
 * is reloaded from and stored back into the context.
 *
 * @param c          scaler context holding filters, buffers and saved state
 * @param src        source plane pointers; for packed input src[1] carries
 *                   the palette and all three entries are pointed at plane 0
 * @param srcStride  per-plane source strides; modified in place for packed
 *                   input and scaled by vChrDrop for the chroma planes
 * @param srcSliceY  first source line of this slice
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers
 * @param dstStride  per-plane destination strides
 * @return number of destination lines output by this call (dstY - lastDstY)
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    const int canMMX2BeUsed= c->canMMX2BeUsed;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *funnyYCode= c->funnyYCode;
    uint8_t *funnyUVCode= c->funnyUVCode;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;   // slice start in chroma lines
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); // slice height in chroma lines, rounded up
    int lastDstY;
    uint8_t *pal=NULL;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    if (isPacked(c->srcFormat)){
        /* packed input: keep the palette pointer from src[1], then make all
           three plane pointers/strides refer to plane 0 */
        pal= src[1];
        src[0]=
        src[1]=
        src[2]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]= srcStride[0];
    }
    /* NOTE(review): shifting the chroma strides by vChrDrop presumably skips
       chroma input lines — confirm against where vChrDrop is set */
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
    //       (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
    {
    static volatile int i=0;
    i++;
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
    i--;
    }
#endif

    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
    //dstStride[0],dstStride[1],dstStride[2]);

    /* warn once when output strides break 8-byte alignment (MMX stores) */
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
    {
        static int firstTime=1; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && firstTime)
        {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            firstTime=0;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    /* main loop: one destination line per iteration; breaks out when the
       current slice has no more input lines for the next output line */
    for (;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
        // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                //printf("%d %d\n", lumBufIndex, vLumBufSize);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
                //FIXME replace parameters through context struct (some at least)

                /* gray->gray etc. needs no chroma scaling */
                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest in the buffer
        {
            /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
            firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
            lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
            vChrBufSize, vLumBufSize);*/

            //Do horizontal scaling
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                            flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                            funnyUVCode, c->srcFormat, formatConvBuffer,
                            c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }

#ifdef HAVE_MMX
        /* per-line dither tables for 15/16-bit RGB output */
        b5Dither= ff_dither8[dstY&1];
        g6Dither= ff_dither4[dstY&1];
        g5Dither= ff_dither8[dstY&1];
        r5Dither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2)
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
            int i;
        /* Pack buffered line pointers and 16-bit vertical coefficients into
           the in-memory layout the MMX vertical scalers expect.  Pointers are
           stored by writing through a void** alias (ACCURATE_RND / APCK_*
           layout) or split into two int32 slots (plain layout). */
        if (flags & SWS_ACCURATE_RND){
            int s= APCK_SIZE / 8;
            for (i=0; i<vLumFilterSize; i+=2){
                *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                          lumMmxFilter[s*i+APCK_COEF/4  ]=
                          lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
            }
            for (i=0; i<vChrFilterSize; i+=2){
                *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                          chrMmxFilter[s*i+APCK_COEF/4  ]=
                          chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
            }
        }else{
            for (i=0; i<vLumFilterSize; i++)
            {
                /* line pointer split into low/high 32-bit halves */
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                /* coefficient duplicated into both 16-bit lanes (*0x10001) */
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
            }
            for (i=0; i<vChrFilterSize; i++)
            {
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
        }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                RENAME(yuv2nv12X)(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, uDest, vDest, dstW, chrDstW);
                }
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                }
                else //general RGB
                {
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                    RENAME(yuv2packedX)(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                    }
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            /* last two lines: plain C output paths only */
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, vDest, dstW, chrDstW);
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT){
                    yuv2rgbXinC_full(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }else{
                yuv2packedXinC(c,
                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                    vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, dstW, dstY);
                }
            }
        }
    }

#ifdef HAVE_MMX
    /* flush non-temporal stores and leave MMX state */
    asm volatile(SFENCE:::"memory");
    asm volatile(EMMS:::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}