Statistics
| Branch: | Revision:

ffmpeg / libswscale / swscale_template.c @ e09d7eef

History | View | Annotate | Download (135 KB)

/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

    
24
/*
 * Forget any earlier definitions of the helper macros defined below, so
 * that they can be redefined for the HAVE_* feature flags currently in
 * effect (presumably because this template is included more than once with
 * different flags -- confirm against the including file).
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
31

    
32
/*
 * EMMS: mnemonic (as an asm string fragment) used to leave MMX state.
 * With 3DNow! available "femms" is used, otherwise plain "emms".
 */
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif
38

    
39
/*
 * PREFETCH / PREFETCHW: prefetch mnemonics as asm string fragments.
 *   3DNow!   : "prefetch"    / "prefetchw"
 *   MMX2     : "prefetchnta" / "prefetcht0"
 *   otherwise: " # nop", which starts an assembler comment, so the fragment
 *              (and presumably any operand text the caller appends on the
 *              same line) emits no instruction.
 */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined (HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
49

    
50
/*
 * SFENCE: store-fence mnemonic as an asm string fragment.  Only emitted
 * when MMX2 is available; otherwise it degrades to an assembler comment.
 */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif
55

    
56
/*
 * PAVGB(a,b): asm string for a packed byte average of a into b.
 * Uses "pavgb" with MMX2, "pavgusb" with 3DNow!.  When neither flag is set
 * the macro is left undefined, so it may only be used in code guarded by
 * one of these flags.
 */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
61

    
62
/*
 * MOVNTQ(a,b): asm string for a 64-bit store of a to b.  With MMX2 the
 * non-temporal "movntq" is used, otherwise an ordinary "movq".
 *
 * The indirection through REAL_MOVNTQ is deliberate: arguments passed to
 * MOVNTQ are macro-expanded first and only then stringified by
 * REAL_MOVNTQ, so register-name macros can be used inside the operands.
 */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
68

    
69
/* Pull in the AltiVec implementations when building with AltiVec support. */
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
72

    
73
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Complete asm statement that filters 16-bit samples from several source
 * lines into one line of 8-bit output, 8 output bytes per outer iteration.
 *
 *   x      - string: byte displacement added to every source address
 *   offset - string: displacement from operand %0 to the filter list
 *   dest   - output pointer (operand %1)
 *   width  - loop bound compared against the output index (operand %2)
 *
 * Operand %0 is &c->redDither; VROUNDER_OFFSET and offset are taken
 * relative to it.  The filter list consists of 16-byte entries: a source
 * pointer at +0 and the coefficient at +8.  A zero source pointer ends the
 * list.  The same local label "1" serves as target for both the inner
 * (per filter tap) and the outer (per 8 pixels) loop; the outer loop
 * re-initialises the accumulators before jumping back.
 *
 * NOTE(review): the statement stores through operand %1 but declares no
 * output operand and no "memory" clobber -- confirm callers do not rely on
 * the compiler seeing that write.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    asm volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t" /* output index = 0 */\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t" /* accumulators start at the rounder */\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t" /* first filter entry */\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t" /* its source pointer */\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t" /* flags survive the MMX ops below */\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t" /* next tap while pointer != 0 */\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t" /* flags survive the reloads below */\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t" /* loop while index < width */\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
108

    
109
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Higher-precision variant of YSCALEYUV2YV12X with the same parameters and
 * operands (%0 = &c->redDither, %1 = dest, %2 = width).
 *
 * Each inner iteration consumes one packed filter entry covering two source
 * lines: the first source pointer is at +0, the second at APCK_PTR2 and the
 * coefficient pair at APCK_COEF; entries are APCK_SIZE bytes apart and a
 * zero first pointer ends the list.  Samples of both lines are interleaved
 * and multiplied with pmaddwd, accumulating 32-bit sums in mm4..mm7.  After
 * the last tap the sums are shifted right by 16, packed to words, the
 * rounder at VROUNDER_OFFSET is added, and the result is shifted right by 3
 * and packed to 8 unsigned bytes that are stored at dest + index.
 *
 * NOTE(review): as in YSCALEYUV2YV12X, the store through %1 is not declared
 * via an output operand or a "memory" clobber.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    asm volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t" /* flags survive the MMX ops below */\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t"\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t" /* flags survive the resets below */\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
170

    
171
/*
 * YSCALEYUV2YV121
 *
 * Asm string fragment (not a complete asm statement) that converts 16-bit
 * samples to 8-bit by an arithmetic shift right by 7 followed by an
 * unsigned saturating pack, 8 samples per iteration.
 *
 *   %0 - source base, indexed as (%0, index, 2)
 *   %1 - destination base, indexed as (%1, index)
 *   %2 - initial value of the index register
 *
 * The loop runs until "add $8" sets the carry flag, i.e. until the index
 * wraps past zero.  This presumably means callers pass a negative count in
 * %2 with both pointers biased to the end of the data -- confirm at the
 * call site.
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
183

    
184
/*
 * YSCALEYUV2YV121_ACCURATE
 *
 * Rounding variant of YSCALEYUV2YV121 with the same operands (%0 source,
 * %1 destination, %2 initial index).  Before the shift right by 7, the
 * constant 64 is added to every word with signed saturation.  The constant
 * is built in mm7 without a memory load: all-ones, logical shift right by
 * 15 (-> 1), shift left by 6 (-> 64).
 */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t" /* mm7 = 0xFFFF in every word */\
    "psrlw                 $15, %%mm7     \n\t" /* mm7 = 1 */\
    "psllw                  $6, %%mm7     \n\t" /* mm7 = 64 */\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
201

    
202
/*
203
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205
       "r" (dest), "m" (dstW),
206
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
208
*/
209
/*
 * YSCALEYUV2PACKEDX
 *
 * Opens an asm volatile statement and emits the vertical filter for packed
 * output.  The statement is intentionally left open: the caller appends
 * the conversion/store code and closes it with YSCALEYUV2PACKEDX_END.
 *
 * Operand %0 is the base for CHR_MMX_FILTER_OFFSET, LUM_MMX_FILTER_OFFSET
 * and VROUNDER_OFFSET.  Both filter lists use 16-byte entries (source
 * pointer at +0, coefficient at +8) and end with a zero pointer.
 *
 * Label "1:" marks the start of the per-output-group loop; the branch back
 * to it is not part of this macro.  Label "2:" is used for both inner
 * (per filter tap) loops.
 *
 * Register contents after the fragment:
 *   mm3 = filtered U, mm4 = filtered V (V read at AV_STRINGIFY(VOF) bytes
 *         after U), mm1 = filtered Y1, mm7 = filtered Y2
 * Each accumulator starts at the rounder value and is not shifted here.
 */
#define YSCALEYUV2PACKEDX \
    asm volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
    "movq                      %%mm1, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm1         \n\t"\
    "paddw                     %%mm5, %%mm7         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"

    
251
/*
 * YSCALEYUV2PACKEDX_END
 *
 * Operand and clobber lists that close the asm statement opened by
 * YSCALEYUV2PACKEDX / YSCALEYUV2PACKEDX_ACCURATE.
 *
 *   %0      = &c->redDither
 *   %1..%3  = dummy (placeholders; presumably kept so that dest and dstW
 *             keep the operand numbers %4 and %5 used by the shared
 *             output code -- confirm against the callers)
 *   %4      = dest
 *   %5      = dstW
 */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
257

    
258
/*
 * YSCALEYUV2PACKEDX_ACCURATE
 *
 * Higher-precision counterpart of YSCALEYUV2PACKEDX.  It opens an asm
 * volatile statement that has to be closed with YSCALEYUV2PACKEDX_END.
 *
 * Filter entries are the packed form: first source pointer at +0, second
 * at APCK_PTR2, coefficient pair at APCK_COEF, entry size APCK_SIZE; a zero
 * first pointer ends the list.  Two source lines are interleaved and
 * multiplied with pmaddwd into 32-bit sums, which are shifted right by 16,
 * packed to words and offset by the rounder at VROUNDER_OFFSET.
 *
 * The chroma results are parked in memory at U_TEMP(%0) and V_TEMP(%0)
 * while the luma pass uses all MMX registers, and are reloaded at the end.
 *
 * Register contents after the fragment (same layout as YSCALEYUV2PACKEDX):
 *   mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t" /* park U while luma is filtered */\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t" /* park V while luma is filtered */\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t" /* reload U */\
    "movq               "V_TEMP"(%0), %%mm4         \n\t" /* reload V */

    
352
/*
 * YSCALEYUV2RGBX
 *
 * Asm string fragment converting 8 pixels from YUV to RGB.  It is meant to
 * follow YSCALEYUV2PACKEDX[_ACCURATE] inside the same asm statement.
 *
 * Input registers:  mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2 (16-bit words)
 * Constants read relative to %0: U_OFFSET, V_OFFSET, Y_OFFSET, Y_COEFF,
 *                                UB_COEFF, UG_COEFF, VG_COEFF, VR_COEFF
 *
 * Each chroma term is duplicated (punpck?wd reg,reg) so that one chroma
 * sample is applied to two neighbouring luma samples.
 *
 * Output registers: mm2 = 8 packed B bytes, mm4 = 8 packed G bytes,
 *                   mm5 = 8 packed R bytes, mm7 = 0
 */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\
    "pxor            %%mm7, %%mm7       \n\t"
388
/*
 * FULL_YSCALEYUV2RGB is compiled out by the "#if 0" below and is kept only
 * for reference.  It references absolute symbols through MANGLE() (w80,
 * w400, yCoeff, ubCoeff, ugCoeff, vrCoeff, vgCoeff) instead of reading the
 * constants relative to a context pointer.
 * NOTE(review): some inline comments describe the operand differently from
 * what the register actually holds (e.g. the ">>4" lines); verify before
 * re-enabling.
 */
#if 0
#define FULL_YSCALEYUV2RGB \
    "pxor                 %%mm7, %%mm7  \n\t"\
    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "xor              %%"REG_a", %%"REG_a"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
\
\
    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
    "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
\
\
    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
    "packuswb             %%mm3, %%mm3  \n\t"\
\
    "packuswb             %%mm0, %%mm0  \n\t"\
    "paddw                %%mm4, %%mm2  \n\t"\
    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
\
    "packuswb             %%mm1, %%mm1  \n\t"
#endif
442

    
443
/*
 * REAL_YSCALEYUV2PACKED(index, c) / YSCALEYUV2PACKED(index, c)
 *
 * Asm string fragment that blends two luma lines (%0 = buf0, %1 = buf1)
 * and two chroma lines (%2 = uvbuf0, %3 = uvbuf1) for packed YUV output.
 *
 *   index - register used as loop index (stringified into the asm text)
 *   c     - register holding the base for the *_MMX_FILTER_OFFSET fields
 *
 * Before the loop, the words at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c) are shifted right by 3 and written back to
 * memory; the loop then uses them as the chroma / luma blend factors.
 * NOTE(review): this rewrites the stored factors on every execution, so
 * callers presumably store fresh values before each use -- confirm.
 *
 * Per sample the loop computes  ((a - b) * factor >> 16) + (b >> 7).
 * The fragment ends inside the loop body (label "1:" is left open); the
 * caller appends the store code and the loop branch.
 *
 * Register contents at the end: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2.
 *
 * YSCALEYUV2PACKED exists so that macro arguments are expanded before
 * REAL_YSCALEYUV2PACKED stringifies them.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
    "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
480

    
481
/*
 * REAL_YSCALEYUV2RGB(index, c) / YSCALEYUV2RGB(index, c)
 *
 * Asm string fragment that blends two luma lines (%0 = buf0, %1 = buf1)
 * and two chroma lines (%2 = uvbuf0, %3 = uvbuf1) and converts the result
 * to RGB, 8 pixels per loop iteration.
 *
 *   index - register used as loop index (stringified into the asm text)
 *   c     - register holding the base for the coefficient/offset fields
 *
 * Blend per sample:  ((a - b) * factor >> 16) + (b >> 4), with the factor
 * read from CHR_MMX_FILTER_OFFSET+8(c) for chroma and
 * LUM_MMX_FILTER_OFFSET+8(c) for luma.  The colour conversion that follows
 * is the same sequence as in YSCALEYUV2RGBX, addressed through c.
 *
 * The fragment ends inside the loop body (label "1:" is left open); the
 * caller appends the store code and the loop branch.
 *
 * Output registers: mm2 = 8 packed B bytes, mm4 = 8 packed G bytes,
 *                   mm5 = 8 packed R bytes, mm7 = 0
 *
 * YSCALEYUV2RGB exists so that macro arguments are expanded before
 * REAL_YSCALEYUV2RGB stringifies them.
 */
#define REAL_YSCALEYUV2RGB(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
546

    
547
/*
 * REAL_YSCALEYUV2PACKED1(index, c) / YSCALEYUV2PACKED1(index, c)
 *
 * Asm string fragment for packed YUV output from a single luma line
 * (%0 = buf0) and a single chroma line (%2 = uvbuf0): the samples are only
 * shifted right by 7, no blending between lines takes place.
 *
 *   index - register used as loop index (stringified into the asm text)
 *   c     - not referenced by this macro; it is accepted so the signature
 *           matches the other YSCALEYUV2* macros
 *
 * The fragment ends inside the loop body (label "1:" is left open); the
 * caller appends the store code and the loop branch.
 *
 * Register contents at the end: mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2.
 *
 * YSCALEYUV2PACKED1 exists so that macro arguments are expanded before
 * REAL_YSCALEYUV2PACKED1 stringifies them.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
561

    
562
/*
 * Loop head + YUV->RGB conversion for 8 pixels taken from a single luma line
 * and a single chroma line (no vertical interpolation).
 *
 * index: register name used as the loop counter (zeroed here, label "1:"
 *        follows; the back-edge "jb 1b" is presumably supplied by a WRITE*
 *        macro appended by the caller).
 * c:     register holding the base address against which the U_OFFSET /
 *        V_OFFSET / Y_OFFSET and *_COEFF displacements are applied.
 *
 * Steps, as visible in the instruction stream:
 *   1. load 4 U and 4 V words (V lives VOF bytes after U), >>4, subtract
 *      the U/V offsets;
 *   2. multiply by the green coefficients (mm3, mm4) and, from saved copies,
 *      by the blue/red coefficients (mm2, mm5);
 *   3. load 8 luma words, >>4, subtract Y offset, multiply by Y coefficient;
 *   4. duplicate each chroma term for two neighbouring pixels (punpck?wd r,r),
 *      add luma, and pack with unsigned saturation to bytes.
 *
 * On exit: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"

/* Two-level wrapper so that arguments are macro-expanded before '#' is applied. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
/*
 * Loop head for packed (non-RGB) output from a single luma line while
 * averaging two chroma lines.
 *
 * Emits index = 0 and the label "1:", then:
 *   - mm3 = (chroma0[index] + chroma1[index])             >> 8  (logical)
 *   - mm4 = (chroma0[index+VOF] + chroma1[index+VOF])     >> 8  (logical)
 *   - mm1 / mm7 = luma words 0..3 / 4..7 from (%0, index, 2), >> 7
 * The sum of two lines followed by >>8 equals the average of the two lines
 * followed by >>7, i.e. the same scale the single-line variant produces.
 *
 * index: register name used as loop counter.
 * c:     accepted for signature parity with the RGB variants; not used here.
 * %2/%3 are the two chroma buffers and %0 the luma buffer per the inline
 * comments; the operand list is supplied by the caller.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"

/* Two-level wrapper so that arguments are macro-expanded before '#' is applied. */
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
// do vertical chrominance interpolation
/*
 * Loop head + YUV->RGB conversion for 8 pixels taken from a single luma line,
 * with chroma averaged over two lines.
 *
 * Identical to the single-chroma-line variant except for the chroma fetch:
 * the two chroma lines (%2 and %3) are added and shifted right by 5 with a
 * logical shift (sum of two lines >>5 == average >>4). As the original FIXME
 * notes, the 16-bit sum may overflow.
 *
 * index: register name used as the loop counter (zeroed here, label "1:").
 * c:     register holding the base address for the *_OFFSET / *_COEFF
 *        displacements.
 *
 * On exit: mm2 = B, mm4 = G, mm5 = R (8 bytes each), mm7 = 0.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"

/* Two-level wrapper so that arguments are macro-expanded before '#' is applied. */
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
/*
 * Store 8 pixels as 32-bit words and close the conversion loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0.
 * Bytes are interleaved to B,G,R,0 in memory order and written with four
 * MOVNTQ stores to dst + index*4 (+0, +8, +16, +24). Afterwards index is
 * advanced by 8 and the loop branches back to label "1:" while
 * index < dstw (unsigned compare).
 *
 * dst:   asm operand/register holding the destination base address.
 * dstw:  asm operand holding the number of pixels to produce.
 * index: register name used as the pixel counter.
 *
 * Clobbers mm0-mm3, mm5 and mm6.
 */
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0,   (dst, index, 4))\
    MOVNTQ(%%mm2,  8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"

/* Two-level wrapper so that arguments are macro-expanded before '#' is applied. */
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
/*
 * Store 8 pixels as 16-bit 5-6-5 words and close the conversion loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0.
 * B and R are masked with bF8 and G with bFC (constants defined elsewhere;
 * judging by their names, 0xF8 / 0xFC in every byte - TODO confirm), which
 * keeps the top 5 / 6 bits. Because the low bits of every byte are cleared
 * first, the 64-bit-wide shifts cannot leak bits into a neighbouring lane.
 * Each output word is (R & F8) << 8 | (G & FC) << 3 | B >> 3.
 *
 * Two MOVNTQ stores write 16 bytes at dst + index*2; index then advances by
 * 8 and the loop branches back to "1:" while index < dstw (unsigned).
 *
 * Clobbers mm1-mm5.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"

/* Two-level wrapper so that arguments are macro-expanded before '#' is applied. */
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
/*
 * Store 8 pixels as 16-bit 5-5-5 words and close the conversion loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0.
 * All three channels are masked with bF8 (constant defined elsewhere;
 * judging by its name, 0xF8 in every byte - TODO confirm). Since the low
 * bits of every byte are cleared first, the 64-bit-wide shifts cannot leak
 * bits into a neighbouring lane.
 * Each output word is (R & F8) << 7 | (G & F8) << 2 | B >> 3; the top bit
 * stays 0.
 *
 * Two MOVNTQ stores write 16 bytes at dst + index*2; index then advances by
 * 8 and the loop branches back to "1:" while index < dstw (unsigned).
 *
 * Clobbers mm1-mm5.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"

/* Two-level wrapper so that arguments are macro-expanded before '#' is applied. */
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
/*
 * Older mask-based variant of the 24-bit writer (the WRITEBGR24 selector
 * does not pick it).
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0.
 * First builds four quadwords of 0RGB0RGB (as in the 32-bit writer), then
 * squeezes the padding bytes out using the bm00000111 / bm11111000 /
 * bm00001111 masks plus shifts, producing 24 contiguous bytes that are
 * written with three MOVNTQ stores at dst, dst+8 and dst+16.
 *
 * dst is advanced by 24 and index by 8; the loop branches back to "1:"
 * while index < dstw (unsigned). Because dst is modified, it must name a
 * scratch register rather than a read-only input operand.
 *
 * Clobbers mm0-mm6.
 */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
/*
 * Plain-MMX 24-bit writer: store 8 pixels as 24 packed bytes and close the
 * conversion loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0 (mm7 is
 * overwritten afterwards).
 * Four quadwords of 0RGB0RGB are built first; each is then compacted to
 * 0RGBRGB0 with a shift + punpckhdq, and neighbouring quadwords are spliced
 * with shifts and ORs into three fully used quadwords, written via MOVNTQ
 * to dst, dst+8 and dst+16.
 *
 * dst is advanced by 24 and index by 8; the loop branches back to "1:"
 * while index < dstw (unsigned). Because dst is modified, it must name a
 * scratch register rather than a read-only input operand.
 *
 * Clobbers mm0-mm7.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
/*
 * MMX2 24-bit writer: store 8 pixels as 24 packed bytes and close the
 * conversion loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each); mm7 is overwritten.
 * For each of the three output quadwords, pshufw replicates the needed
 * source words of every channel, the ff_M24A / ff_M24B / ff_M24C byte masks
 * (defined elsewhere) keep only the bytes belonging at that position, and
 * the three channels are ORed together. G is shifted by one byte (psllq for
 * the first quadword, a persistent psrlq on mm4 for the other two) so that
 * its bytes line up between B and R.
 *
 * dst is advanced by 24 and index by 8; the loop branches back to "1:"
 * while index < dstw (unsigned). Because dst is modified, it must name a
 * scratch register rather than a read-only input operand.
 *
 * Clobbers mm0, mm1, mm3, mm4, mm6 and mm7.
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
/*
 * Select the 24-bit writer for this instantiation of the template: the
 * pshufw-based one when MMX2 is available, the plain MMX one otherwise.
 * The #undef discards any definition left over from an earlier expansion
 * of this template.
 */
#undef WRITEBGR24
#ifdef HAVE_MMX2
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
/*
 * Store 8 pixels as packed 4:2:2 (Y U Y V byte order) and close the loop.
 *
 * Expects 16-bit words: mm1 = luma 0..3, mm7 = luma 4..7, mm3 and mm4 = the
 * two chroma planes (4 samples each). Everything is packed to bytes with
 * unsigned saturation; the chroma bytes are interleaved (mm3 first, then
 * mm4) and that chroma stream is interleaved with the 8 luma bytes.
 *
 * Two MOVNTQ stores write 16 bytes at dst + index*2; index then advances by
 * 8 and the loop branches back to "1:" while index < dstw (unsigned).
 *
 * Clobbers mm1, mm3, mm4 and mm7.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"

/* Two-level wrapper so that arguments are macro-expanded before '#' is applied. */
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
/**
 * Vertically filter luma and chroma lines into planar 8-bit output.
 *
 * With MMX the work is done by the YSCALEYUV2YV12X / _ACCURATE macros
 * (defined elsewhere in this file), selected by SWS_ACCURATE_RND in
 * c->flags. The chroma planes are processed only when uDest is non-NULL;
 * the second chroma plane is addressed VOF bytes after the first.
 * In that build the filter/source arguments are not referenced directly
 * here; only the offsets CHR_MMX_FILTER_OFFSET / LUM_MMX_FILTER_OFFSET are
 * passed to the macros.
 *
 * Without MMX the call is forwarded unchanged to the AltiVec
 * implementation if available, otherwise to the C reference implementation.
 *
 * @param dest   luma output line
 * @param uDest  first chroma output line, or NULL to skip chroma
 * @param vDest  second chroma output line (used only if uDest is non-NULL)
 * @param dstW     number of luma samples to write
 * @param chrDstW  number of chroma samples to write per plane
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (c->flags & SWS_ACCURATE_RND){
        if (uDest){
            YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }else{
        if (uDest){
            YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }
#elif defined(HAVE_ALTIVEC)
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else /* neither MMX nor AltiVec */
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                dest, uDest, vDest, dstW, chrDstW);
#endif /* HAVE_MMX */
}
/**
 * Vertically filter luma and chroma lines into semi-planar output.
 *
 * There is no SIMD version in this template: every instantiation forwards
 * all arguments except the context to yuv2nv12XinC(). The context parameter
 * is kept so the signature parallels the other vertical scalers.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
/**
 * Convert one already-filtered line of 16-bit intermediate samples to
 * planar 8-bit output (no vertical filtering).
 *
 * The C path computes (sample + 64) >> 7 and clamps the result to 0..255.
 * The MMX path hands each plane to YSCALEYUV2YV121 / _ACCURATE (defined
 * elsewhere in this file), selected by SWS_ACCURATE_RND in c->flags.
 *
 * @param lumSrc  luma input line
 * @param chrSrc  chroma input line; the second plane starts VOFW elements
 *                after the first
 * @param dest    luma output line
 * @param uDest   first chroma output line, or NULL to skip chroma
 * @param vDest   second chroma output line (used only if uDest is non-NULL)
 * @param dstW     number of luma samples
 * @param chrDstW  number of chroma samples per plane
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    /* Planes are processed from index p-1 down to 0, so luma (index 0) is
     * always handled and the chroma planes only when uDest is set. */
    long p= uDest ? 3 : 1;
    /* The sources are int16_t lines; keeping the element type makes the
     * "+ width" arithmetic and the initialisation type-correct (the array
     * used to be declared uint8_t*, an incompatible pointer type). */
    int16_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
    uint8_t *dst[3]= {dest, uDest, vDest};
    long counter[3] = {dstW, chrDstW, chrDstW};

    /* Each asm block receives pointers to the END of the source/destination
     * line plus the negated length; presumably the macro counts the negative
     * index up towards zero - confirm against YSCALEYUV2YV121. */
    if (c->flags & SWS_ACCURATE_RND){
        while(p--){
            asm volatile(
                YSCALEYUV2YV121_ACCURATE
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                "g" (-counter[p])
                : "%"REG_a
            );
        }
    }else{
        while(p--){
            asm volatile(
                YSCALEYUV2YV121
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                "g" (-counter[p])
                : "%"REG_a
            );
        }
    }

#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        /* Bit 8 is set for every value outside 0..255 that a 16-bit input
         * can produce here (-256..-1 and 256), so one test covers both
         * clamping directions. */
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            /* Same bit-8 shortcut, evaluated for both samples at once. */
            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
#endif
}
/**
1054
 * vertical scale YV12 to RGB
1055
 */
1056
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1057
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1058
                                       uint8_t *dest, long dstW, long dstY)
1059
{
1060
#ifdef HAVE_MMX
1061
    long dummy=0;
1062
    if (c->flags & SWS_ACCURATE_RND){
1063
        switch(c->dstFormat){
1064
        case PIX_FMT_RGB32:
1065
            YSCALEYUV2PACKEDX_ACCURATE
1066
            YSCALEYUV2RGBX
1067
            WRITEBGR32(%4, %5, %%REGa)
1068

    
1069
            YSCALEYUV2PACKEDX_END
1070
            return;
1071
        case PIX_FMT_BGR24:
1072
            YSCALEYUV2PACKEDX_ACCURATE
1073
            YSCALEYUV2RGBX
1074
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1075
            "add %4, %%"REG_c"                        \n\t"
1076
            WRITEBGR24(%%REGc, %5, %%REGa)
1077

    
1078

    
1079
            :: "r" (&c->redDither),
1080
               "m" (dummy), "m" (dummy), "m" (dummy),
1081
               "r" (dest), "m" (dstW)
1082
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1083
            );
1084
            return;
1085
        case PIX_FMT_RGB555:
1086
            YSCALEYUV2PACKEDX_ACCURATE
1087
            YSCALEYUV2RGBX
1088
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1089
#ifdef DITHER1XBPP
1090
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1091
            "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1092
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1093
#endif
1094

    
1095
            WRITERGB15(%4, %5, %%REGa)
1096
            YSCALEYUV2PACKEDX_END
1097
            return;
1098
        case PIX_FMT_RGB565:
1099
            YSCALEYUV2PACKEDX_ACCURATE
1100
            YSCALEYUV2RGBX
1101
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1102
#ifdef DITHER1XBPP
1103
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1104
            "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1105
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1106
#endif
1107

    
1108
            WRITERGB16(%4, %5, %%REGa)
1109
            YSCALEYUV2PACKEDX_END
1110
            return;
1111
        case PIX_FMT_YUYV422:
1112
            YSCALEYUV2PACKEDX_ACCURATE
1113
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1114

    
1115
            "psraw $3, %%mm3    \n\t"
1116
            "psraw $3, %%mm4    \n\t"
1117
            "psraw $3, %%mm1    \n\t"
1118
            "psraw $3, %%mm7    \n\t"
1119
            WRITEYUY2(%4, %5, %%REGa)
1120
            YSCALEYUV2PACKEDX_END
1121
            return;
1122
    }
1123
    }else{
1124
        switch(c->dstFormat)
1125
        {
1126
        case PIX_FMT_RGB32:
1127
            YSCALEYUV2PACKEDX
1128
            YSCALEYUV2RGBX
1129
            WRITEBGR32(%4, %5, %%REGa)
1130
            YSCALEYUV2PACKEDX_END
1131
            return;
1132
        case PIX_FMT_BGR24:
1133
            YSCALEYUV2PACKEDX
1134
            YSCALEYUV2RGBX
1135
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1136
            "add                        %4, %%"REG_c"   \n\t"
1137
            WRITEBGR24(%%REGc, %5, %%REGa)
1138

    
1139
            :: "r" (&c->redDither),
1140
               "m" (dummy), "m" (dummy), "m" (dummy),
1141
               "r" (dest),  "m" (dstW)
1142
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1143
            );
1144
            return;
1145
        case PIX_FMT_RGB555:
1146
            YSCALEYUV2PACKEDX
1147
            YSCALEYUV2RGBX
1148
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1149
#ifdef DITHER1XBPP
1150
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
1151
            "paddusb "MANGLE(g5Dither)", %%mm4  \n\t"
1152
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
1153
#endif
1154

    
1155
            WRITERGB15(%4, %5, %%REGa)
1156
            YSCALEYUV2PACKEDX_END
1157
            return;
1158
        case PIX_FMT_RGB565:
1159
            YSCALEYUV2PACKEDX
1160
            YSCALEYUV2RGBX
1161
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1162
#ifdef DITHER1XBPP
1163
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
1164
            "paddusb "MANGLE(g6Dither)", %%mm4  \n\t"
1165
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
1166
#endif
1167

    
1168
            WRITERGB16(%4, %5, %%REGa)
1169
            YSCALEYUV2PACKEDX_END
1170
            return;
1171
        case PIX_FMT_YUYV422:
1172
            YSCALEYUV2PACKEDX
1173
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1174

    
1175
            "psraw $3, %%mm3    \n\t"
1176
            "psraw $3, %%mm4    \n\t"
1177
            "psraw $3, %%mm1    \n\t"
1178
            "psraw $3, %%mm7    \n\t"
1179
            WRITEYUY2(%4, %5, %%REGa)
1180
            YSCALEYUV2PACKEDX_END
1181
            return;
1182
        }
1183
    }
1184
#endif /* HAVE_MMX */
1185
#ifdef HAVE_ALTIVEC
1186
    /* The following list of supported dstFormat values should
1187
       match what's found in the body of altivec_yuv2packedX() */
1188
    if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1189
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1190
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
1191
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1192
                                 chrFilter, chrSrc, chrFilterSize,
1193
                                 dest, dstW, dstY);
1194
    else
1195
#endif
1196
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1197
                       chrFilter, chrSrc, chrFilterSize,
1198
                       dest, dstW, dstY);
1199
}
/**
1202
 * vertical bilinear scale YV12 to RGB
1203
 */
1204
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1205
                          uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1206
{
1207
    int  yalpha1=4095- yalpha;
1208
    int uvalpha1=4095-uvalpha;
1209
    int i;
1210

    
1211
#if 0 //isn't used
1212
    if (flags&SWS_FULL_CHR_H_INT)
1213
    {
1214
        switch(dstFormat)
1215
        {
1216
#ifdef HAVE_MMX
1217
        case PIX_FMT_RGB32:
1218
            asm volatile(
1219

1220

1221
FULL_YSCALEYUV2RGB
1222
            "punpcklbw %%mm1, %%mm3    \n\t" // BGBGBGBG
1223
            "punpcklbw %%mm7, %%mm0    \n\t" // R0R0R0R0
1224

1225
            "movq      %%mm3, %%mm1    \n\t"
1226
            "punpcklwd %%mm0, %%mm3    \n\t" // BGR0BGR0
1227
            "punpckhwd %%mm0, %%mm1    \n\t" // BGR0BGR0
1228

1229
            MOVNTQ(%%mm3,  (%4, %%REGa, 4))
1230
            MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1231

1232
            "add $4, %%"REG_a"  \n\t"
1233
            "cmp %5, %%"REG_a"  \n\t"
1234
            " jb 1b             \n\t"
1235

1236
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1237
            "m" (yalpha1), "m" (uvalpha1)
1238
            : "%"REG_a
1239
            );
1240
            break;
1241
        case PIX_FMT_BGR24:
1242
            asm volatile(
1243

1244
FULL_YSCALEYUV2RGB
1245

1246
                                              // lsb ... msb
1247
            "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
1248
            "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0
1249

1250
            "movq      %%mm3, %%mm1     \n\t"
1251
            "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
1252
            "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0
1253

1254
            "movq      %%mm3, %%mm2     \n\t" // BGR0BGR0
1255
            "psrlq        $8, %%mm3     \n\t" // GR0BGR00
1256
            "pand "MANGLE(bm00000111)", %%mm2   \n\t" // BGR00000
1257
            "pand "MANGLE(bm11111000)", %%mm3   \n\t" // 000BGR00
1258
            "por       %%mm2, %%mm3     \n\t" // BGRBGR00
1259
            "movq      %%mm1, %%mm2     \n\t"
1260
            "psllq       $48, %%mm1     \n\t" // 000000BG
1261
            "por       %%mm1, %%mm3     \n\t" // BGRBGRBG
1262

1263
            "movq      %%mm2, %%mm1     \n\t" // BGR0BGR0
1264
            "psrld       $16, %%mm2     \n\t" // R000R000
1265
            "psrlq       $24, %%mm1     \n\t" // 0BGR0000
1266
            "por       %%mm2, %%mm1     \n\t" // RBGRR000
1267

1268
            "mov          %4, %%"REG_b" \n\t"
1269
            "add   %%"REG_a", %%"REG_b" \n\t"
1270

1271
#ifdef HAVE_MMX2
1272
            //FIXME Alignment
1273
            "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2)   \n\t"
1274
            "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)   \n\t"
1275
#else
1276
            "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)     \n\t"
1277
            "psrlq  $32, %%mm3                          \n\t"
1278
            "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)     \n\t"
1279
            "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)     \n\t"
1280
#endif
1281
            "add     $4, %%"REG_a"                      \n\t"
1282
            "cmp     %5, %%"REG_a"                      \n\t"
1283
            " jb     1b                                 \n\t"
1284

    
1285
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1286
            "m" (yalpha1), "m" (uvalpha1)
1287
            : "%"REG_a, "%"REG_b
1288
            );
1289
            break;
1290
        case PIX_FMT_BGR555:
1291
            asm volatile(
1292

    
1293
FULL_YSCALEYUV2RGB
1294
#ifdef DITHER1XBPP
1295
            "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1296
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1297
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1298
#endif
1299
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1300
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1301
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1302

    
1303
            "psrlw                   $3, %%mm3  \n\t"
1304
            "psllw                   $2, %%mm1  \n\t"
1305
            "psllw                   $7, %%mm0  \n\t"
1306
            "pand     "MANGLE(g15Mask)", %%mm1  \n\t"
1307
            "pand     "MANGLE(r15Mask)", %%mm0  \n\t"
1308

    
1309
            "por                  %%mm3, %%mm1  \n\t"
1310
            "por                  %%mm1, %%mm0  \n\t"
1311

    
1312
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1313

    
1314
            "add $4, %%"REG_a"  \n\t"
1315
            "cmp %5, %%"REG_a"  \n\t"
1316
            " jb 1b             \n\t"
1317

    
1318
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1319
            "m" (yalpha1), "m" (uvalpha1)
1320
            : "%"REG_a
1321
            );
1322
            break;
1323
        case PIX_FMT_BGR565:
1324
            asm volatile(
1325

    
1326
FULL_YSCALEYUV2RGB
1327
#ifdef DITHER1XBPP
1328
            "paddusb "MANGLE(g6Dither)", %%mm1  \n\t"
1329
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1330
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1331
#endif
1332
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1333
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1334
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1335

    
1336
            "psrlw                   $3, %%mm3  \n\t"
1337
            "psllw                   $3, %%mm1  \n\t"
1338
            "psllw                   $8, %%mm0  \n\t"
1339
            "pand     "MANGLE(g16Mask)", %%mm1  \n\t"
1340
            "pand     "MANGLE(r16Mask)", %%mm0  \n\t"
1341

    
1342
            "por                  %%mm3, %%mm1  \n\t"
1343
            "por                  %%mm1, %%mm0  \n\t"
1344

    
1345
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1346

    
1347
            "add $4, %%"REG_a"  \n\t"
1348
            "cmp %5, %%"REG_a"  \n\t"
1349
            " jb 1b             \n\t"
1350

    
1351
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1352
            "m" (yalpha1), "m" (uvalpha1)
1353
            : "%"REG_a
1354
            );
1355
            break;
1356
#endif /* HAVE_MMX */
1357
        case PIX_FMT_BGR32:
1358
#ifndef HAVE_MMX
1359
        case PIX_FMT_RGB32:
1360
#endif
1361
            if (dstFormat==PIX_FMT_RGB32)
1362
            {
1363
                int i;
1364
#ifdef WORDS_BIGENDIAN
1365
                dest++;
1366
#endif
1367
                for (i=0;i<dstW;i++){
1368
                    // vertical linear interpolation && yuv2rgb in a single step:
1369
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1370
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1371
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1372
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1373
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1374
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1375
                    dest+= 4;
1376
                }
1377
            }
1378
            else if (dstFormat==PIX_FMT_BGR24)
1379
            {
1380
                int i;
1381
                for (i=0;i<dstW;i++){
1382
                    // vertical linear interpolation && yuv2rgb in a single step:
1383
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1384
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1385
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1386
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1387
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1388
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1389
                    dest+= 3;
1390
                }
1391
            }
1392
            else if (dstFormat==PIX_FMT_BGR565)
1393
            {
1394
                int i;
1395
                for (i=0;i<dstW;i++){
1396
                    // vertical linear interpolation && yuv2rgb in a single step:
1397
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1398
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1399
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1400

    
1401
                    ((uint16_t*)dest)[i] =
1402
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1403
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1404
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1405
                }
1406
            }
1407
            else if (dstFormat==PIX_FMT_BGR555)
1408
            {
1409
                int i;
1410
                for (i=0;i<dstW;i++){
1411
                    // vertical linear interpolation && yuv2rgb in a single step:
1412
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1413
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1414
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1415

    
1416
                    ((uint16_t*)dest)[i] =
1417
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1418
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1419
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1420
                }
1421
            }
1422
        }//FULL_UV_IPOL
1423
    else
1424
    {
1425
#endif // if 0
1426
#ifdef HAVE_MMX
1427
        switch(c->dstFormat)
1428
        {
1429
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1430
            case PIX_FMT_RGB32:
1431
                asm volatile(
1432
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1433
                "mov        %4, %%"REG_b"               \n\t"
1434
                "push %%"REG_BP"                        \n\t"
1435
                YSCALEYUV2RGB(%%REGBP, %5)
1436
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1437
                "pop %%"REG_BP"                         \n\t"
1438
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1439

    
1440
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1441
                "a" (&c->redDither)
1442
                );
1443
                return;
1444
            case PIX_FMT_BGR24:
1445
                asm volatile(
1446
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1447
                "mov        %4, %%"REG_b"               \n\t"
1448
                "push %%"REG_BP"                        \n\t"
1449
                YSCALEYUV2RGB(%%REGBP, %5)
1450
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1451
                "pop %%"REG_BP"                         \n\t"
1452
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1453
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1454
                "a" (&c->redDither)
1455
                );
1456
                return;
1457
            case PIX_FMT_RGB555:
1458
                asm volatile(
1459
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1460
                "mov        %4, %%"REG_b"               \n\t"
1461
                "push %%"REG_BP"                        \n\t"
1462
                YSCALEYUV2RGB(%%REGBP, %5)
1463
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1464
#ifdef DITHER1XBPP
1465
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1466
                "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1467
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1468
#endif
1469

    
1470
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1471
                "pop %%"REG_BP"                         \n\t"
1472
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1473

    
1474
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475
                "a" (&c->redDither)
1476
                );
1477
                return;
1478
            case PIX_FMT_RGB565:
1479
                asm volatile(
1480
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1481
                "mov        %4, %%"REG_b"               \n\t"
1482
                "push %%"REG_BP"                        \n\t"
1483
                YSCALEYUV2RGB(%%REGBP, %5)
1484
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1485
#ifdef DITHER1XBPP
1486
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1487
                "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1488
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1489
#endif
1490

    
1491
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1492
                "pop %%"REG_BP"                         \n\t"
1493
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1494
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1495
                "a" (&c->redDither)
1496
                );
1497
                return;
1498
            case PIX_FMT_YUYV422:
1499
                asm volatile(
1500
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1501
                "mov %4, %%"REG_b"                        \n\t"
1502
                "push %%"REG_BP"                        \n\t"
1503
                YSCALEYUV2PACKED(%%REGBP, %5)
1504
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1505
                "pop %%"REG_BP"                         \n\t"
1506
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1507
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1508
                "a" (&c->redDither)
1509
                );
1510
                return;
1511
            default: break;
1512
        }
1513
#endif //HAVE_MMX
1514
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONOBLACK2_C)
1515
}
1516

    
1517
/**
 * YV12 to RGB without scaling or interpolating.
 *
 * Converts the samples in buf0 / uvbuf0 / uvbuf1 into packed pixels at dest.
 *
 * Control flow, as visible in this body:
 *  - If SWS_FULL_CHR_H_INT is set in flags, the call is forwarded to
 *    yuv2packed2() with buf0 passed twice and a literal 0 in front of
 *    uvalpha; nothing else in this function runs.
 *  - With HAVE_MMX, the destination formats RGB32, BGR24, RGB555, RGB565 and
 *    YUYV422 are handled by inline assembly that returns directly.
 *    uvalpha < 2048 selects the YSCALEYUV2RGB1 / YSCALEYUV2PACKED1 macros,
 *    any other uvalpha selects the corresponding "1b" macros.
 *  - Every other dstFormat (and every format when HAVE_MMX is undefined)
 *    reaches the YSCALE_YUV_2_ANYRGB_C expansions at the bottom, again split
 *    on uvalpha < 2048.
 *
 * yalpha1, yalpha and i are never referenced by name in this body, and y is
 * only forwarded to yuv2packed2(); presumably the YSCALE_YUV_2_*_C macros
 * expand to code that uses the locals -- TODO confirm against the macro
 * definitions.
 */
1520
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1521
                          uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1522
{
1523
    const int yalpha1=0;
1524
    int i;
1525

    
1526
    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1527
    const int yalpha= 4096; //FIXME ...
1528

    
1529
    /* Full chroma interpolation requested: let yuv2packed2() do the work. */
    if (flags&SWS_FULL_CHR_H_INT)
1530
    {
1531
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1532
        return;
1533
    }
1534

    
1535
#ifdef HAVE_MMX
1536
    /*
     * Every asm statement below follows the same pattern:
     *   1. save REG_b into the slot ESP_OFFSET(%5), where operand 5 is
     *      &c->redDither (passed in the "a" register),
     *   2. load dest (operand 4, a memory operand) into REG_b,
     *   3. push REG_BP, which is then handed to the conversion and WRITE*
     *      macros as their first/last argument,
     *   4. pop REG_BP and restore REG_b from the same slot.
     * buf0, buf1, uvbuf0 and uvbuf1 are pinned to the c, d, S and D registers.
     * The literal 8280 is described elsewhere in this file as being equal to
     * DSTW_OFFSET.
     */
    if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1537
    {
1538
        switch(dstFormat)
1539
        {
1540
        case PIX_FMT_RGB32:
1541
            asm volatile(
1542
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1543
            "mov        %4, %%"REG_b"               \n\t"
1544
            "push %%"REG_BP"                        \n\t"
1545
            YSCALEYUV2RGB1(%%REGBP, %5)
1546
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1547
            "pop %%"REG_BP"                         \n\t"
1548
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1549

    
1550
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1551
            "a" (&c->redDither)
1552
            );
1553
            return;
1554
        case PIX_FMT_BGR24:
1555
            asm volatile(
1556
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1557
            "mov        %4, %%"REG_b"               \n\t"
1558
            "push %%"REG_BP"                        \n\t"
1559
            YSCALEYUV2RGB1(%%REGBP, %5)
1560
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1561
            "pop %%"REG_BP"                         \n\t"
1562
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1563

    
1564
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1565
            "a" (&c->redDither)
1566
            );
1567
            return;
1568
        case PIX_FMT_RGB555:
1569
            asm volatile(
1570
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1571
            "mov        %4, %%"REG_b"               \n\t"
1572
            "push %%"REG_BP"                        \n\t"
1573
            YSCALEYUV2RGB1(%%REGBP, %5)
1574
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1575
#ifdef DITHER1XBPP
1576
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1577
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1578
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1579
#endif
1580
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1581
            "pop %%"REG_BP"                         \n\t"
1582
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1583

    
1584
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1585
            "a" (&c->redDither)
1586
            );
1587
            return;
1588
        case PIX_FMT_RGB565:
1589
            asm volatile(
1590
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1591
            "mov        %4, %%"REG_b"               \n\t"
1592
            "push %%"REG_BP"                        \n\t"
1593
            YSCALEYUV2RGB1(%%REGBP, %5)
1594
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1595
#ifdef DITHER1XBPP
1596
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1597
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1598
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1599
#endif
1600

    
1601
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1602
            "pop %%"REG_BP"                         \n\t"
1603
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1604

    
1605
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1606
            "a" (&c->redDither)
1607
            );
1608
            return;
1609
        case PIX_FMT_YUYV422:
1610
            asm volatile(
1611
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1612
            "mov        %4, %%"REG_b"               \n\t"
1613
            "push %%"REG_BP"                        \n\t"
1614
            YSCALEYUV2PACKED1(%%REGBP, %5)
1615
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1616
            "pop %%"REG_BP"                         \n\t"
1617
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1618

    
1619
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1620
            "a" (&c->redDither)
1621
            );
1622
            return;
1623
        }
1624
    }
1625
    else
1626
    {
1627
        /* Same five formats as above, using the "1b" conversion macros. */
        switch(dstFormat)
1628
        {
1629
        case PIX_FMT_RGB32:
1630
            asm volatile(
1631
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1632
            "mov        %4, %%"REG_b"               \n\t"
1633
            "push %%"REG_BP"                        \n\t"
1634
            YSCALEYUV2RGB1b(%%REGBP, %5)
1635
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1636
            "pop %%"REG_BP"                         \n\t"
1637
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1638

    
1639
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1640
            "a" (&c->redDither)
1641
            );
1642
            return;
1643
        case PIX_FMT_BGR24:
1644
            asm volatile(
1645
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1646
            "mov        %4, %%"REG_b"               \n\t"
1647
            "push %%"REG_BP"                        \n\t"
1648
            YSCALEYUV2RGB1b(%%REGBP, %5)
1649
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1650
            "pop %%"REG_BP"                         \n\t"
1651
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1652

    
1653
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1654
            "a" (&c->redDither)
1655
            );
1656
            return;
1657
        case PIX_FMT_RGB555:
1658
            asm volatile(
1659
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1660
            "mov        %4, %%"REG_b"               \n\t"
1661
            "push %%"REG_BP"                        \n\t"
1662
            YSCALEYUV2RGB1b(%%REGBP, %5)
1663
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1664
#ifdef DITHER1XBPP
1665
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1666
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1667
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1668
#endif
1669
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1670
            "pop %%"REG_BP"                         \n\t"
1671
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1672

    
1673
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1674
            "a" (&c->redDither)
1675
            );
1676
            return;
1677
        case PIX_FMT_RGB565:
1678
            asm volatile(
1679
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1680
            "mov        %4, %%"REG_b"               \n\t"
1681
            "push %%"REG_BP"                        \n\t"
1682
            YSCALEYUV2RGB1b(%%REGBP, %5)
1683
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1684
#ifdef DITHER1XBPP
1685
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1686
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1687
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1688
#endif
1689

    
1690
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1691
            "pop %%"REG_BP"                         \n\t"
1692
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1693

    
1694
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1695
            "a" (&c->redDither)
1696
            );
1697
            return;
1698
        case PIX_FMT_YUYV422:
1699
            asm volatile(
1700
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1701
            "mov        %4, %%"REG_b"               \n\t"
1702
            "push %%"REG_BP"                        \n\t"
1703
            YSCALEYUV2PACKED1b(%%REGBP, %5)
1704
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1705
            "pop %%"REG_BP"                         \n\t"
1706
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1707

    
1708
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1709
            "a" (&c->redDither)
1710
            );
1711
            return;
1712
        }
1713
    }
1714
#endif /* HAVE_MMX */
1715
    /* C path: reached for formats the MMX switches above do not return from,
     * and for all formats when HAVE_MMX is not defined. */
    if (uvalpha < 2048)
1716
    {
1717
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONOBLACK2_C)
1718
    }else{
1719
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONOBLACK2_C)
1720
    }
1721
}
1722

    
1723
//FIXME yuy2* can read up to 7 samples too much
1724

    
1725
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1726
{
1727
#ifdef HAVE_MMX
1728
    asm volatile(
1729
    "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1730
    "mov                    %0, %%"REG_a"       \n\t"
1731
    "1:                                         \n\t"
1732
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1733
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1734
    "pand                %%mm2, %%mm0           \n\t"
1735
    "pand                %%mm2, %%mm1           \n\t"
1736
    "packuswb            %%mm1, %%mm0           \n\t"
1737
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
1738
    "add                    $8, %%"REG_a"       \n\t"
1739
    " js                    1b                  \n\t"
1740
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1741
    : "%"REG_a
1742
    );
1743
#else
1744
    int i;
1745
    for (i=0; i<width; i++)
1746
        dst[i]= src[2*i];
1747
#endif
1748
}
1749

    
1750
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1751
{
1752
#ifdef HAVE_MMX
1753
    asm volatile(
1754
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1755
    "mov                    %0, %%"REG_a"       \n\t"
1756
    "1:                                         \n\t"
1757
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1758
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1759
    "psrlw                  $8, %%mm0           \n\t"
1760
    "psrlw                  $8, %%mm1           \n\t"
1761
    "packuswb            %%mm1, %%mm0           \n\t"
1762
    "movq                %%mm0, %%mm1           \n\t"
1763
    "psrlw                  $8, %%mm0           \n\t"
1764
    "pand                %%mm4, %%mm1           \n\t"
1765
    "packuswb            %%mm0, %%mm0           \n\t"
1766
    "packuswb            %%mm1, %%mm1           \n\t"
1767
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
1768
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
1769
    "add                    $4, %%"REG_a"       \n\t"
1770
    " js                    1b                  \n\t"
1771
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1772
    : "%"REG_a
1773
    );
1774
#else
1775
    int i;
1776
    for (i=0; i<width; i++)
1777
    {
1778
        dstU[i]= src1[4*i + 1];
1779
        dstV[i]= src1[4*i + 3];
1780
    }
1781
#endif
1782
    assert(src1 == src2);
1783
}
1784

    
1785
/* This is almost identical to the previous, end exists only because
1786
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1787
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1788
{
1789
#ifdef HAVE_MMX
1790
    asm volatile(
1791
    "mov                  %0, %%"REG_a"         \n\t"
1792
    "1:                                         \n\t"
1793
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1794
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1795
    "psrlw                $8, %%mm0             \n\t"
1796
    "psrlw                $8, %%mm1             \n\t"
1797
    "packuswb          %%mm1, %%mm0             \n\t"
1798
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1799
    "add                  $8, %%"REG_a"         \n\t"
1800
    " js                  1b                    \n\t"
1801
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1802
    : "%"REG_a
1803
    );
1804
#else
1805
    int i;
1806
    for (i=0; i<width; i++)
1807
        dst[i]= src[2*i+1];
1808
#endif
1809
}
1810

    
1811
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1812
{
1813
#ifdef HAVE_MMX
1814
    asm volatile(
1815
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1816
    "mov                    %0, %%"REG_a"       \n\t"
1817
    "1:                                         \n\t"
1818
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1819
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1820
    "pand                %%mm4, %%mm0           \n\t"
1821
    "pand                %%mm4, %%mm1           \n\t"
1822
    "packuswb            %%mm1, %%mm0           \n\t"
1823
    "movq                %%mm0, %%mm1           \n\t"
1824
    "psrlw                  $8, %%mm0           \n\t"
1825
    "pand                %%mm4, %%mm1           \n\t"
1826
    "packuswb            %%mm0, %%mm0           \n\t"
1827
    "packuswb            %%mm1, %%mm1           \n\t"
1828
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
1829
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
1830
    "add                    $4, %%"REG_a"       \n\t"
1831
    " js                    1b                  \n\t"
1832
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1833
    : "%"REG_a
1834
    );
1835
#else
1836
    int i;
1837
    for (i=0; i<width; i++)
1838
    {
1839
        dstU[i]= src1[4*i + 0];
1840
        dstV[i]= src1[4*i + 2];
1841
    }
1842
#endif
1843
    assert(src1 == src2);
1844
}
1845

    
1846
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
1847
static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width)\
1848
{\
1849
    int i;\
1850
    for (i=0; i<width; i++)\
1851
    {\
1852
        int b= (((type*)src)[i]>>shb)&maskb;\
1853
        int g= (((type*)src)[i]>>shg)&maskg;\
1854
        int r= (((type*)src)[i]>>shr)&maskr;\
1855
\
1856
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
1857
    }\
1858
}
1859

    
1860
BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1861
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1862
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
1863
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
1864
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1865
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1866

    
1867
#define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
1868
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
1869
{\
1870
    int i;\
1871
    for (i=0; i<width; i++)\
1872
    {\
1873
        int b= (((type*)src)[i]&maskb)>>shb;\
1874
        int g= (((type*)src)[i]&maskg)>>shg;\
1875
        int r= (((type*)src)[i]&maskr)>>shr;\
1876
\
1877
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
1878
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
1879
    }\
1880
}\
1881
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width)\
1882
{\
1883
    int i;\
1884
    for (i=0; i<width; i++)\
1885
    {\
1886
        int pix0= ((type*)src)[2*i+0];\
1887
        int pix1= ((type*)src)[2*i+1];\
1888
        int g= (pix0&maskg)+(pix1&maskg);\
1889
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
1890
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
1891
\
1892
        g>>=shg;\
1893
\
1894
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
1895
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
1896
    }\
1897
}
1898

    
1899
BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1900
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1901
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
1902
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
1903
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1904
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1905

    
1906
#ifdef HAVE_MMX
1907
/**
 * MMX conversion of packed 3-byte pixels to one output byte per pixel.
 *
 * @param dst       receives the output bytes
 * @param src       packed input; 12 bytes are consumed per loop pass
 * @param width     number of output bytes requested
 * @param srcFormat PIX_FMT_BGR24 selects the ff_bgr24toY*Coeff tables; any
 *                  other value selects the ff_rgb24toY*Coeff tables
 *
 * The main loop writes 4 output bytes per pass and runs its body before the
 * counter is tested, so it always executes at least once and works in
 * groups of 4.
 */
static inline void bgr24ToY_mmx(uint8_t *dst, uint8_t *src, long width, int srcFormat)
1908
{
1909

    
1910
    /* Load the two coefficient vectors into mm5 / mm6.
     * NOTE(review): these registers are set in a separate asm statement and
     * are read by the loop below without being declared as operands or
     * clobbers -- confirm the compiler cannot touch MMX state in between. */
    if(srcFormat == PIX_FMT_BGR24){
1911
        asm volatile(
1912
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
1913
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
1914
            :
1915
        );
1916
    }else{
1917
        asm volatile(
1918
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
1919
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
1920
            :
1921
        );
1922
    }
1923

    
1924
    /* Operands: %0 = src (advanced by 12 each pass), %1 = dst+width,
     * %2 = -width. REG_a starts at -width and is incremented by 4 until it
     * is no longer negative, so (%1, REG_a) walks dst from its start.
     * mm4 holds ff_bgr24toYOffset, mm7 is zero for byte->word unpacking.
     * Each pass does four 4-byte loads at offsets 0, 2, 6 and 8, multiplies
     * them with mm5/mm6 (pmaddwd), adds the offset, shifts right by 15 and
     * packs the results down to 4 bytes. */
    asm volatile(
1925
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
1926
        "mov                        %2, %%"REG_a"   \n\t"
1927
        "pxor                    %%mm7, %%mm7       \n\t"
1928
        "1:                                         \n\t"
1929
        PREFETCH"               64(%0)              \n\t"
1930
        "movd                     (%0), %%mm0       \n\t"
1931
        "movd                    2(%0), %%mm1       \n\t"
1932
        "movd                    6(%0), %%mm2       \n\t"
1933
        "movd                    8(%0), %%mm3       \n\t"
1934
        "add                       $12, %0          \n\t"
1935
        "punpcklbw               %%mm7, %%mm0       \n\t"
1936
        "punpcklbw               %%mm7, %%mm1       \n\t"
1937
        "punpcklbw               %%mm7, %%mm2       \n\t"
1938
        "punpcklbw               %%mm7, %%mm3       \n\t"
1939
        "pmaddwd                 %%mm5, %%mm0       \n\t"
1940
        "pmaddwd                 %%mm6, %%mm1       \n\t"
1941
        "pmaddwd                 %%mm5, %%mm2       \n\t"
1942
        "pmaddwd                 %%mm6, %%mm3       \n\t"
1943
        "paddd                   %%mm1, %%mm0       \n\t"
1944
        "paddd                   %%mm3, %%mm2       \n\t"
1945
        "paddd                   %%mm4, %%mm0       \n\t"
1946
        "paddd                   %%mm4, %%mm2       \n\t"
1947
        "psrad                     $15, %%mm0       \n\t"
1948
        "psrad                     $15, %%mm2       \n\t"
1949
        "packssdw                %%mm2, %%mm0       \n\t"
1950
        "packuswb                %%mm0, %%mm0       \n\t"
1951
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
1952
        "add                        $4, %%"REG_a"   \n\t"
1953
        " js                        1b              \n\t"
1954
    : "+r" (src)
1955
    : "r" (dst+width), "g" (-width)
1956
    : "%"REG_a
1957
    );
1958
}
1959

    
1960
static inline void bgr24ToUV_mmx(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
1961
{
1962
    asm volatile(
1963
        "movq                    24+%4, %%mm6       \n\t"
1964
        "mov                        %3, %%"REG_a"   \n\t"
1965
        "pxor                    %%mm7, %%mm7       \n\t"
1966
        "1:                                         \n\t"
1967
        PREFETCH"               64(%0)              \n\t"
1968
        "movd                     (%0), %%mm0       \n\t"
1969
        "movd                    2(%0), %%mm1       \n\t"
1970
        "punpcklbw               %%mm7, %%mm0       \n\t"
1971
        "punpcklbw               %%mm7, %%mm1       \n\t"
1972
        "movq                    %%mm0, %%mm2       \n\t"
1973
        "movq                    %%mm1, %%mm3       \n\t"
1974
        "pmaddwd                    %4, %%mm0       \n\t"
1975
        "pmaddwd                  8+%4, %%mm1       \n\t"
1976
        "pmaddwd                 16+%4, %%mm2       \n\t"
1977
        "pmaddwd                 %%mm6, %%mm3       \n\t"
1978
        "paddd                   %%mm1, %%mm0       \n\t"
1979
        "paddd                   %%mm3, %%mm2       \n\t"
1980

    
1981
        "movd                    6(%0), %%mm1       \n\t"
1982
        "movd                    8(%0), %%mm3       \n\t"
1983
        "add                       $12, %0          \n\t"
1984
        "punpcklbw               %%mm7, %%mm1       \n\t"
1985
        "punpcklbw               %%mm7, %%mm3       \n\t"
1986
        "movq                    %%mm1, %%mm4       \n\t"
1987
        "movq                    %%mm3, %%mm5       \n\t"
1988
        "pmaddwd                    %4, %%mm1       \n\t"
1989
        "pmaddwd                  8+%4, %%mm3       \n\t"
1990
        "pmaddwd                 16+%4, %%mm4       \n\t"
1991
        "pmaddwd                 %%mm6, %%mm5       \n\t"
1992
        "paddd                   %%mm3, %%mm1       \n\t"
1993
        "paddd                   %%mm5, %%mm4       \n\t"
1994

    
1995
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
1996
        "paddd                   %%mm3, %%mm0       \n\t"
1997
        "paddd                   %%mm3, %%mm2       \n\t"
1998
        "paddd                   %%mm3, %%mm1       \n\t"
1999
        "paddd                   %%mm3, %%mm4       \n\t"
2000
        "psrad                     $15, %%mm0       \n\t"
2001
        "psrad                     $15, %%mm2       \n\t"
2002
        "psrad                     $15, %%mm1       \n\t"
2003
        "psrad                     $15, %%mm4       \n\t"
2004
        "packssdw                %%mm1, %%mm0       \n\t"
2005
        "packssdw                %%mm4, %%mm2       \n\t"
2006
        "packuswb                %%mm0, %%mm0       \n\t"
2007
        "packuswb                %%mm2, %%mm2       \n\t"
2008
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
2009
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
2010
        "add                        $4, %%"REG_a"   \n\t"
2011
        " js                        1b              \n\t"
2012
    : "+r" (src)
2013
    : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
2014
    : "%"REG_a
2015
    );
2016
}
2017
#endif
2018

    
2019
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
2020
{
2021
#ifdef HAVE_MMX
2022
    bgr24ToY_mmx(dst, src, width, PIX_FMT_BGR24);
2023
#else
2024
    int i;
2025
    for (i=0; i<width; i++)
2026
    {
2027
        int b= src[i*3+0];
2028
        int g= src[i*3+1];
2029
        int r= src[i*3+2];
2030

    
2031
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2032
    }
2033
#endif /* HAVE_MMX */
2034
}
2035

    
2036
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2037
{
2038
#ifdef HAVE_MMX
2039
    bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_BGR24);
2040
#else
2041
    int i;
2042
    for (i=0; i<width; i++)
2043
    {
2044
        int b= src1[3*i + 0];
2045
        int g= src1[3*i + 1];
2046
        int r= src1[3*i + 2];
2047

    
2048
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2049
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2050
    }
2051
#endif /* HAVE_MMX */
2052
    assert(src1 == src2);
2053
}
2054

    
2055
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2056
{
2057
    int i;
2058
    for (i=0; i<width; i++)
2059
    {
2060
        int b= src1[6*i + 0] + src1[6*i + 3];
2061
        int g= src1[6*i + 1] + src1[6*i + 4];
2062
        int r= src1[6*i + 2] + src1[6*i + 5];
2063

    
2064
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2065
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2066
    }
2067
    assert(src1 == src2);
2068
}
2069

    
2070
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width)
2071
{
2072
#ifdef HAVE_MMX
2073
    bgr24ToY_mmx(dst, src, width, PIX_FMT_RGB24);
2074
#else
2075
    int i;
2076
    for (i=0; i<width; i++)
2077
    {
2078
        int r= src[i*3+0];
2079
        int g= src[i*3+1];
2080
        int b= src[i*3+2];
2081

    
2082
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2083
    }
2084
#endif
2085
}
2086

    
2087
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2088
{
2089
    int i;
2090
    assert(src1==src2);
2091
#ifdef HAVE_MMX
2092
    bgr24ToUV_mmx(dstU, dstV, src1, width, PIX_FMT_RGB24);
2093
#else
2094
    for (i=0; i<width; i++)
2095
    {
2096
        int r= src1[3*i + 0];
2097
        int g= src1[3*i + 1];
2098
        int b= src1[3*i + 2];
2099

    
2100
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2101
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2102
    }
2103
#endif
2104
}
2105

    
2106
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
2107
{
2108
    int i;
2109
    assert(src1==src2);
2110
    for (i=0; i<width; i++)
2111
    {
2112
        int r= src1[6*i + 0] + src1[6*i + 3];
2113
        int g= src1[6*i + 1] + src1[6*i + 4];
2114
        int b= src1[6*i + 2] + src1[6*i + 5];
2115

    
2116
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2117
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2118
    }
2119
}
2120

    
2121

    
2122
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2123
{
2124
    int i;
2125
    for (i=0; i<width; i++)
2126
    {
2127
        int d= src[i];
2128

    
2129
        dst[i]= pal[d] & 0xFF;
2130
    }
2131
}
2132

    
2133
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2134
{
2135
    int i;
2136
    assert(src1 == src2);
2137
    for (i=0; i<width; i++)
2138
    {
2139
        int p= pal[src1[i]];
2140

    
2141
        dstU[i]= p>>8;
2142
        dstV[i]= p>>16;
2143
    }
2144
}
2145

    
2146
// bilinear / bicubic scaling
2147
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
2148
                                  int16_t *filter, int16_t *filterPos, long filterSize)
2149
{
2150
#ifdef HAVE_MMX
2151
    assert(filterSize % 4 == 0 && filterSize>0);
2152
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2153
    {
2154
        long counter= -2*dstW;
2155
        filter-= counter*2;
2156
        filterPos-= counter/2;
2157
        dst-= counter/2;
2158
        asm volatile(
2159
#if defined(PIC)
2160
        "push            %%"REG_b"              \n\t"
2161
#endif
2162
        "pxor                %%mm7, %%mm7       \n\t"
2163
        "movq        "MANGLE(w02)", %%mm6       \n\t"
2164
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
2165
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
2166
        ASMALIGN(4)
2167
        "1:                                     \n\t"
2168
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
2169
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
2170
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
2171
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
2172
        "movd      (%3, %%"REG_a"), %%mm0       \n\t"
2173
        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
2174
        "punpcklbw           %%mm7, %%mm0       \n\t"
2175
        "punpcklbw           %%mm7, %%mm2       \n\t"
2176
        "pmaddwd             %%mm1, %%mm0       \n\t"
2177
        "pmaddwd             %%mm2, %%mm3       \n\t"
2178
        "psrad                  $8, %%mm0       \n\t"
2179
        "psrad                  $8, %%mm3       \n\t"
2180
        "packssdw            %%mm3, %%mm0       \n\t"
2181
        "pmaddwd             %%mm6, %%mm0       \n\t"
2182
        "packssdw            %%mm0, %%mm0       \n\t"
2183
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
2184
        "add                    $4, %%"REG_BP"  \n\t"
2185
        " jnc                   1b              \n\t"
2186

    
2187
        "pop            %%"REG_BP"              \n\t"
2188
#if defined(PIC)
2189
        "pop             %%"REG_b"              \n\t"
2190
#endif
2191
        : "+a" (counter)
2192
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2193
#if !defined(PIC)
2194
        : "%"REG_b
2195
#endif
2196
        );
2197
    }
2198
    else if (filterSize==8)
2199
    {
2200
        long counter= -2*dstW;
2201
        filter-= counter*4;
2202
        filterPos-= counter/2;
2203
        dst-= counter/2;
2204
        asm volatile(
2205
#if defined(PIC)
2206
        "push             %%"REG_b"             \n\t"
2207
#endif
2208
        "pxor                 %%mm7, %%mm7      \n\t"
2209
        "movq         "MANGLE(w02)", %%mm6      \n\t"
2210
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
2211
        "mov              %%"REG_a", %%"REG_BP" \n\t"
2212
        ASMALIGN(4)
2213
        "1:                                     \n\t"
2214
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
2215
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
2216
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
2217
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2218
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
2219
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
2220
        "punpcklbw            %%mm7, %%mm0      \n\t"
2221
        "punpcklbw            %%mm7, %%mm2      \n\t"
2222
        "pmaddwd              %%mm1, %%mm0      \n\t"
2223
        "pmaddwd              %%mm2, %%mm3      \n\t"
2224

    
2225
        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
2226
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2227
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
2228
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
2229
        "punpcklbw            %%mm7, %%mm4      \n\t"
2230
        "punpcklbw            %%mm7, %%mm2      \n\t"
2231
        "pmaddwd              %%mm1, %%mm4      \n\t"
2232
        "pmaddwd              %%mm2, %%mm5      \n\t"
2233
        "paddd                %%mm4, %%mm0      \n\t"
2234
        "paddd                %%mm5, %%mm3      \n\t"
2235

    
2236
        "psrad                   $8, %%mm0      \n\t"
2237
        "psrad                   $8, %%mm3      \n\t"
2238
        "packssdw             %%mm3, %%mm0      \n\t"
2239
        "pmaddwd              %%mm6, %%mm0      \n\t"
2240
        "packssdw             %%mm0, %%mm0      \n\t"
2241
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2242
        "add                     $4, %%"REG_BP" \n\t"
2243
        " jnc                    1b             \n\t"
2244

    
2245
        "pop             %%"REG_BP"             \n\t"
2246
#if defined(PIC)
2247
        "pop              %%"REG_b"             \n\t"
2248
#endif
2249
        : "+a" (counter)
2250
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2251
#if !defined(PIC)
2252
        : "%"REG_b
2253
#endif
2254
        );
2255
    }
2256
    else
2257
    {
2258
        uint8_t *offset = src+filterSize;
2259
        long counter= -2*dstW;
2260
        //filter-= counter*filterSize/2;
2261
        filterPos-= counter/2;
2262
        dst-= counter/2;
2263
        asm volatile(
2264
        "pxor                  %%mm7, %%mm7     \n\t"
2265
        "movq          "MANGLE(w02)", %%mm6     \n\t"
2266
        ASMALIGN(4)
2267
        "1:                                     \n\t"
2268
        "mov                      %2, %%"REG_c" \n\t"
2269
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2270
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2271
        "mov                      %5, %%"REG_c" \n\t"
2272
        "pxor                  %%mm4, %%mm4     \n\t"
2273
        "pxor                  %%mm5, %%mm5     \n\t"
2274
        "2:                                     \n\t"
2275
        "movq                   (%1), %%mm1     \n\t"
2276
        "movq               (%1, %6), %%mm3     \n\t"
2277
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2278
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2279
        "punpcklbw             %%mm7, %%mm0     \n\t"
2280
        "punpcklbw             %%mm7, %%mm2     \n\t"
2281
        "pmaddwd               %%mm1, %%mm0     \n\t"
2282
        "pmaddwd               %%mm2, %%mm3     \n\t"
2283
        "paddd                 %%mm3, %%mm5     \n\t"
2284
        "paddd                 %%mm0, %%mm4     \n\t"
2285
        "add                      $8, %1        \n\t"
2286
        "add                      $4, %%"REG_c" \n\t"
2287
        "cmp                      %4, %%"REG_c" \n\t"
2288
        " jb                      2b            \n\t"
2289
        "add                      %6, %1        \n\t"
2290
        "psrad                    $8, %%mm4     \n\t"
2291
        "psrad                    $8, %%mm5     \n\t"
2292
        "packssdw              %%mm5, %%mm4     \n\t"
2293
        "pmaddwd               %%mm6, %%mm4     \n\t"
2294
        "packssdw              %%mm4, %%mm4     \n\t"
2295
        "mov                      %3, %%"REG_a" \n\t"
2296
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2297
        "add                      $4, %0        \n\t"
2298
        " jnc                     1b            \n\t"
2299

    
2300
        : "+r" (counter), "+r" (filter)
2301
        : "m" (filterPos), "m" (dst), "m"(offset),
2302
          "m" (src), "r" (filterSize*2)
2303
        : "%"REG_a, "%"REG_c, "%"REG_d
2304
        );
2305
    }
2306
#else
2307
#ifdef HAVE_ALTIVEC
2308
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2309
#else
2310
    int i;
2311
    for (i=0; i<dstW; i++)
2312
    {
2313
        int j;
2314
        int srcPos= filterPos[i];
2315
        int val=0;
2316
        //printf("filterPos: %d\n", filterPos[i]);
2317
        for (j=0; j<filterSize; j++)
2318
        {
2319
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2320
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2321
        }
2322
        //filter += hFilterSize;
2323
        dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
2324
        //dst[i] = val>>7;
2325
    }
2326
#endif /* HAVE_ALTIVEC */
2327
#endif /* HAVE_MMX */
2328
}
2329
      // *** horizontal scale Y line to temp buffer
2330
/**
 * Produce one horizontally scaled luma line.
 *
 * Steps, in order:
 *  1. If srcFormat is one of the packed YUV, 16-bit gray, RGB/BGR or
 *     paletted formats listed below, the matching *ToY helper converts the
 *     line into formatConvBuffer and src is redirected to it.
 *  2. The 8-bit line is scaled into dst, either by hScale() or by one of
 *     the fast bilinear implementations (MMX2 generated code, plain x86 asm,
 *     or portable C).
 *  3. If c->srcRange differs from c->dstRange and the destination format is
 *     neither RGB nor BGR, dst is rescaled in place.
 *
 * @param c                 context; srcRange, dstRange and dstFormat are read
 * @param dst               output line, dstWidth samples
 * @param dstWidth          number of output samples
 * @param src               input line in srcFormat
 * @param srcW              input width in pixels
 * @param xInc              source step per output sample, 16.16 fixed point
 * @param flags             SWS_* flags; only SWS_FAST_BILINEAR is tested here
 * @param canMMX2BeUsed     non-zero if the MMX2 fast bilinear code may run
 * @param hLumFilter        coefficients passed to hScale()
 * @param hLumFilterPos     source positions passed to hScale()
 * @param hLumFilterSize    taps per output sample passed to hScale()
 * @param funnyYCode        address called from the MMX2 fast bilinear path
 * @param srcFormat         pixel format of src
 * @param formatConvBuffer  scratch line for the format conversion of step 1
 * @param mmx2Filter        table loaded into REG_d for the MMX2 path
 * @param mmx2FilterPos     table loaded into REG_b for the MMX2 path
 * @param pal               palette, accessed as uint32_t entries by palToY()
 */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint8_t *pal)
{
    /* Step 1: bring the source line into planar 8-bit form if necessary. */
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
        src= formatConvBuffer;
    }

    /* Step 2: horizontal scaling. */
#ifdef HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
        int i;
#if defined(PIC)
        /* REG_b is saved here by the asm instead of being listed as a
         * clobber when building position-independent code. */
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            asm volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

/* One invocation of the code at funnyYCode followed by advancing the source
 * (REG_c) and destination (REG_D) pointers.  The definitions deliberately do
 * not end in a line continuation, so they do not depend on the next line
 * being blank. */
#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyYCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            /* "memory": the called code stores to dst (and, with PIC, the asm
             * stores to ebxsave) although neither is declared as an output. */
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D, "memory"
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            /* Overwrite the trailing samples whose source position
             * (i*xInc)>>16 is at or beyond the last source pixel with that
             * pixel scaled by 128. */
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif /* HAVE_MMX2 */
        long xInc_shr16 = xInc >> 16;          /* integer part of the step    */
        uint16_t xInc_mask = xInc & 0xffff;    /* fractional part of the step */
        //NO MMX just normal asm ...
        /* Two output samples are produced per loop iteration, so for an odd
         * dstWidth one sample past dst[dstWidth-1] is written. */
        asm volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= (xInc>>16) + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= (xInc>>16) + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        /* "memory": dst is written through a pointer that is only an input */
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi", "memory"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            /* xpos is 16.16 fixed point: integer part selects the source
             * pixel, the top 7 fraction bits are the blend weight. */
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif /* defined(ARCH_X86) */
    }

    /* Step 3: value range conversion of the scaled line.
     * NOTE(review): the constants look like full<->limited range scaling
     * (14071/16384 ~ 219/255, 19077/16384 ~ 255/219) -- confirm. */
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++)
                dst[i]= (dst[i]*14071 + 33561947)>>14;
        }else{
            /* the input is clamped to 30189 before it is expanded */
            for (i=0; i<dstWidth; i++)
                dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
        }
    }
}
2554

    
2555
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2556
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2557
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2558
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2559
                                   int32_t *mmx2FilterPos, uint8_t *pal)
2560
{
2561
    if (srcFormat==PIX_FMT_YUYV422)
2562
    {
2563
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2564
        src1= formatConvBuffer;
2565
        src2= formatConvBuffer+VOFW;
2566
    }
2567
    else if (srcFormat==PIX_FMT_UYVY422)
2568
    {
2569
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2570
        src1= formatConvBuffer;
2571
        src2= formatConvBuffer+VOFW;
2572
    }
2573
    else if (srcFormat==PIX_FMT_RGB32)
2574
    {
2575
        if(c->chrSrcHSubSample)
2576
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2577
        else
2578
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2579
        src1= formatConvBuffer;
2580
        src2= formatConvBuffer+VOFW;
2581
    }
2582
    else if (srcFormat==PIX_FMT_RGB32_1)
2583
    {
2584
        if(c->chrSrcHSubSample)
2585
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2586
        else
2587
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2588
        src1= formatConvBuffer;
2589
        src2= formatConvBuffer+VOFW;
2590
    }
2591
    else if (srcFormat==PIX_FMT_BGR24)
2592
    {
2593
        if(c->chrSrcHSubSample)
2594
            RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2595
        else
2596
            RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2597
        src1= formatConvBuffer;
2598
        src2= formatConvBuffer+VOFW;
2599
    }
2600
    else if (srcFormat==PIX_FMT_BGR565)
2601
    {
2602
        if(c->chrSrcHSubSample)
2603
            RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2604
        else
2605
            RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2606
        src1= formatConvBuffer;
2607
        src2= formatConvBuffer+VOFW;
2608
    }
2609
    else if (srcFormat==PIX_FMT_BGR555)
2610
    {
2611
        if(c->chrSrcHSubSample)
2612
            RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2613
        else
2614
            RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2615
        src1= formatConvBuffer;
2616
        src2= formatConvBuffer+VOFW;
2617
    }
2618
    else if (srcFormat==PIX_FMT_BGR32)
2619
    {
2620
        if(c->chrSrcHSubSample)
2621
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2622
        else
2623
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2624
        src1= formatConvBuffer;
2625
        src2= formatConvBuffer+VOFW;
2626
    }
2627
    else if (srcFormat==PIX_FMT_BGR32_1)
2628
    {
2629
        if(c->chrSrcHSubSample)
2630
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2631
        else
2632
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2633
        src1= formatConvBuffer;
2634
        src2= formatConvBuffer+VOFW;
2635
    }
2636
    else if (srcFormat==PIX_FMT_RGB24)
2637
    {
2638
        if(c->chrSrcHSubSample)
2639
            RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2640
        else
2641
            RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2642
        src1= formatConvBuffer;
2643
        src2= formatConvBuffer+VOFW;
2644
    }
2645
    else if (srcFormat==PIX_FMT_RGB565)
2646
    {
2647
        if(c->chrSrcHSubSample)
2648
            RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2649
        else
2650
            RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2651
        src1= formatConvBuffer;
2652
        src2= formatConvBuffer+VOFW;
2653
    }
2654
    else if (srcFormat==PIX_FMT_RGB555)
2655
    {
2656
        if(c->chrSrcHSubSample)
2657
            RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2658
        else
2659
            RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2660
        src1= formatConvBuffer;
2661
        src2= formatConvBuffer+VOFW;
2662
    }
2663
    else if (isGray(srcFormat))
2664
    {
2665
        return;
2666
    }
2667
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2668
    {
2669
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2670
        src1= formatConvBuffer;
2671
        src2= formatConvBuffer+VOFW;
2672
    }
2673

    
2674
#ifdef HAVE_MMX
2675
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2676
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2677
#else
2678
    if (!(flags&SWS_FAST_BILINEAR))
2679
#endif
2680
    {
2681
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2682
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2683
    }
2684
    else // fast bilinear upscale / crap downscale
2685
    {
2686
#if defined(ARCH_X86)
2687
#ifdef HAVE_MMX2
2688
        int i;
2689
#if defined(PIC)
2690
        uint64_t ebxsave __attribute__((aligned(8)));
2691
#endif
2692
        if (canMMX2BeUsed)
2693
        {
2694
            asm volatile(
2695
#if defined(PIC)
2696
            "mov          %%"REG_b", %6         \n\t"
2697
#endif
2698
            "pxor             %%mm7, %%mm7      \n\t"
2699
            "mov                 %0, %%"REG_c"  \n\t"
2700
            "mov                 %1, %%"REG_D"  \n\t"
2701
            "mov                 %2, %%"REG_d"  \n\t"
2702
            "mov                 %3, %%"REG_b"  \n\t"
2703
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2704
            PREFETCH"   (%%"REG_c")             \n\t"
2705
            PREFETCH" 32(%%"REG_c")             \n\t"
2706
            PREFETCH" 64(%%"REG_c")             \n\t"
2707

    
2708
#ifdef ARCH_X86_64
2709

    
2710
#define FUNNY_UV_CODE \
2711
            "movl       (%%"REG_b"), %%esi      \n\t"\
2712
            "call               *%4             \n\t"\
2713
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2714
            "add          %%"REG_S", %%"REG_c"  \n\t"\
2715
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2716
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2717

    
2718
#else
2719

    
2720
#define FUNNY_UV_CODE \
2721
            "movl       (%%"REG_b"), %%esi      \n\t"\
2722
            "call               *%4             \n\t"\
2723
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2724
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2725
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2726

    
2727
#endif /* ARCH_X86_64 */
2728

    
2729
FUNNY_UV_CODE
2730
FUNNY_UV_CODE
2731
FUNNY_UV_CODE
2732
FUNNY_UV_CODE
2733
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2734
            "mov                 %5, %%"REG_c"  \n\t" // src
2735
            "mov                 %1, %%"REG_D"  \n\t" // buf1
2736
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
2737
            PREFETCH"   (%%"REG_c")             \n\t"
2738
            PREFETCH" 32(%%"REG_c")             \n\t"
2739
            PREFETCH" 64(%%"REG_c")             \n\t"
2740

    
2741
FUNNY_UV_CODE
2742
FUNNY_UV_CODE
2743
FUNNY_UV_CODE
2744
FUNNY_UV_CODE
2745

    
2746
#if defined(PIC)
2747
            "mov %6, %%"REG_b"    \n\t"
2748
#endif
2749
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2750
            "m" (funnyUVCode), "m" (src2)
2751
#if defined(PIC)
2752
            ,"m" (ebxsave)
2753
#endif
2754
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2755
#if !defined(PIC)
2756
             ,"%"REG_b
2757
#endif
2758
            );
2759
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2760
            {
2761
                //printf("%d %d %d\n", dstWidth, i, srcW);
2762
                dst[i] = src1[srcW-1]*128;
2763
                dst[i+VOFW] = src2[srcW-1]*128;
2764
            }
2765
        }
2766
        else
2767
        {
2768
#endif /* HAVE_MMX2 */
2769
            long xInc_shr16 = (long) (xInc >> 16);
2770
            uint16_t xInc_mask = xInc & 0xffff;
2771
            asm volatile(
2772
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
2773
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2774
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
2775
            ASMALIGN(4)
2776
            "1:                                     \n\t"
2777
            "mov        %0, %%"REG_S"               \n\t"
2778
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2779
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2780
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2781
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2782
            "shll      $16, %%edi                   \n\t"
2783
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2784
            "mov        %1, %%"REG_D"               \n\t"
2785
            "shrl       $9, %%esi                   \n\t"
2786
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2787

    
2788
            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2789
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2790
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2791
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2792
            "shll      $16, %%edi                   \n\t"
2793
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2794
            "mov        %1, %%"REG_D"               \n\t"
2795
            "shrl       $9, %%esi                   \n\t"
2796
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
2797

    
2798
            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
2799
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
2800
            "add        $1, %%"REG_a"               \n\t"
2801
            "cmp        %2, %%"REG_a"               \n\t"
2802
            " jb        1b                          \n\t"
2803

    
2804
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2805
   which is needed to support GCC 4.0. */
2806
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2807
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2808
#else
2809
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2810
#endif
2811
            "r" (src2)
2812
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2813
            );
2814
#ifdef HAVE_MMX2
2815
        } //if MMX2 can't be used
2816
#endif
2817
#else
2818
        int i;
2819
        unsigned int xpos=0;
2820
        for (i=0;i<dstWidth;i++)
2821
        {
2822
            register unsigned int xx=xpos>>16;
2823
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
2824
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2825
            dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2826
            /* slower
2827
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2828
            dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2829
            */
2830
            xpos+=xInc;
2831
        }
2832
#endif /* defined(ARCH_X86) */
2833
    }
2834
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2835
        int i;
2836
        //FIXME all pal and rgb srcFormats could do this convertion as well
2837
        //FIXME all scalers more complex than bilinear could do half of this transform
2838
        if(c->srcRange){
2839
            for (i=0; i<dstWidth; i++){
2840
                dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
2841
                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2842
            }
2843
        }else{
2844
            for (i=0; i<dstWidth; i++){
2845
                dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2846
                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2847
            }
2848
        }
2849
    }
2850
}
2851

    
2852
/**
 * Scale one horizontal slice of the source picture into the destination.
 *
 * Source lines of the slice are first scaled horizontally (hyscale for luma,
 * hcscale for chroma) into the ring buffers c->lumPixBuf / c->chrPixBuf.
 * As soon as all input lines needed by an output line are buffered, that
 * output line is filtered vertically and written to dst. When the slice does
 * not contain enough lines for the next output line, the remaining lines of
 * the slice are buffered and the function returns; the progress is stored in
 * the context so that the next call can continue.
 *
 * Side effects on the arguments: for packed source formats src[0..2] and
 * srcStride[0..2] are overwritten with the values of plane 0 (src[1] is
 * taken as the palette pointer first), and srcStride[1] / srcStride[2] are
 * always shifted left by c->vChrDrop.
 *
 * @param c          scaler context (filters, ring buffers, saved progress)
 * @param src        source plane pointers of the slice (modified, see above)
 * @param srcStride  source strides (modified, see above)
 * @param srcSliceY  index of the first source line contained in the slice
 * @param srcSliceH  number of source lines in the slice
 * @param dst        destination plane pointers
 * @param dstStride  destination strides
 * @return number of destination lines written by this call
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    const int canMMX2BeUsed= c->canMMX2BeUsed;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *funnyYCode= c->funnyYCode;
    uint8_t *funnyUVCode= c->funnyUVCode;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    /* negate-shift-negate rounds the chroma slice height up instead of down */
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint8_t *pal=NULL;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* Packed input: plane 1 is handed on as the palette pointer, and all
       three plane pointers / strides are made to refer to plane 0. */
    if (isPacked(c->srcFormat)){
        pal= src[1];
        src[0]=
        src[1]=
        src[2]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
    //       (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
    {
    static volatile int i=0;
    i++;
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
    i--;
    }
#endif

    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
    //dstStride[0],dstStride[1],dstStride[2]);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
    {
        static int firstTime=1; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && firstTime)
        {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            firstTime=0;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    /* remembered so that the number of lines written can be returned */
    lastDstY= dstY;

    for (;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
        // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                //printf("%d %d\n", lumBufIndex, vLumBufSize);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
                //FIXME replace parameters through context struct (some at least)

                /* chroma is not scaled when either side is gray; the buffer
                   bookkeeping is still advanced */
                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest in the buffer
        {
            /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
            firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
            lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
            vChrBufSize, vLumBufSize);*/

            //Do horizontal scaling
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                            flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                            funnyUVCode, c->srcFormat, formatConvBuffer,
                            c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }

#ifdef HAVE_MMX
        /* select the dither rows by the parity of the output line */
        b5Dither= ff_dither8[dstY&1];
        g6Dither= ff_dither4[dstY&1];
        g5Dither= ff_dither8[dstY&1];
        r5Dither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2)
        {
            /* lumBufIndex / chrBufIndex designate the slot of the newest
               buffered line (lastIn*Buf); step back to the first line the
               vertical filter needs. Adding the ring size keeps the pointer
               inside the 2*v*BufSize entries checked by the asserts above;
               presumably the upper half of the pointer array mirrors the
               lower half -- confirm against the allocation code. */
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
            int i;
        /* Store the source line pointers and the filter coefficients of this
           output line into lumMmxFilter / chrMmxFilter. */
        if (flags & SWS_ACCURATE_RND){
            int s= APCK_SIZE / 8;
            for (i=0; i<vLumFilterSize; i+=2){
                *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                          lumMmxFilter[s*i+APCK_COEF/4  ]=
                          lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
            }
            for (i=0; i<vChrFilterSize; i+=2){
                *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                          chrMmxFilter[s*i+APCK_COEF/4  ]=
                          chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
            }
        }else{
            for (i=0; i<vLumFilterSize; i++)
            {
                /* pointer split into low / high 32 bits, coefficient
                   replicated into both 16-bit halves of two words */
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
            }
            for (i=0; i<vChrFilterSize; i++)
            {
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
        }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                RENAME(yuv2nv12X)(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                    /* Fetch the line through lumSrcPtr / chrSrcPtr like every
                       other output path does: slot 0 of the ring buffer is
                       only guaranteed to hold the wanted line when the ring
                       consists of a single entry. */
                    int16_t *lumBuf = lumSrcPtr[0];
                    int16_t *chrBuf= chrSrcPtr[0];
                    RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, uDest, vDest, dstW, chrDstW);
                }
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                }
                else //general RGB
                {
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                    RENAME(yuv2packedX)(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                    }
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            /* the last two output lines always go through the plain C writers */
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, vDest, dstW, chrDstW);
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT){
                    yuv2rgbXinC_full(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }else{
                yuv2packedXinC(c,
                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                    vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, dstW, dstY);
                }
            }
        }
    }

#ifdef HAVE_MMX
    /* SFENCE / EMMS expand to the matching instruction or to an assembler
       no-op comment, depending on the CPU feature macros */
    asm volatile(SFENCE:::"memory");
    asm volatile(EMMS:::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}