Statistics
| Branch: | Revision:

ffmpeg / libswscale / swscale_template.c @ 896a22b8

History | View | Annotate | Download (137 KB)

1
/*
2
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation; either version 2 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU General Public License
17
 * along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 *
20
 * The C code (not assembly, MMX, ...) of this file can be used
21
 * under the LGPL license.
22
 */
23

    
24
/* CPU-specific instruction aliases. Each macro is #undef'd first because this
 * template is included multiple times with different HAVE_* settings, so the
 * aliases must be re-selected on every inclusion. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Prefetch hints: 3DNow! has its own opcodes; MMX2 uses the SSE hints;
 * otherwise fall back to an assembler comment (no-op). */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined (HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

/* Store fence is only available (and only needed, after movntq) with MMX2. */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

/* Byte average: pavgb (MMX2) or the 3DNow! equivalent pavgusb. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* Non-temporal store on MMX2, plain movq otherwise. The extra MOVNTQ
 * indirection lets arguments that are themselves macros expand first. */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
/* Vertical scale: multiply 16-bit source rows by 16-bit filter coefficients
 * (pmulhw), accumulate, shift right by 3 and pack to unsigned bytes into
 * dest. The filter list at "offset"(%0) is a (coeff, srcPtr) sequence,
 * terminated by a NULL pointer (the test/jnz pair below). */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    asm volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t"\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
/* Higher-precision variant of YSCALEYUV2YV12X: interleaves two source rows
 * (punpcklwd/punpckhwd) and uses pmaddwd to accumulate in 32 bits, adding the
 * rounder only once at the end before >>3 and packing to bytes. Walks the
 * packed filter in APCK_SIZE steps; terminated by a NULL source pointer. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    asm volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t"\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
/* 1:1 vertical pass (single source row): shift 16-bit samples right by 7 and
 * pack to unsigned bytes. %2 holds a negative start index so the loop runs
 * until the add carries past zero (jnc). Truncating (no rounding) variant. */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
/* Rounding variant of YSCALEYUV2YV121: mm7 is built as 64 in every 16-bit
 * lane (pcmpeqw/psrlw/psllw) and added (saturating) before the >>7, i.e.
 * round-to-nearest instead of truncation. */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/* Prologue shared by the packed-output scalers: runs the chroma filter loop
 * (U in mm3, V in mm4; V rows live at byte offset VOF from U) and then the
 * luma filter loop (Y1 in mm1, Y2 in mm7). Each inner loop walks a
 * (coeff, srcPtr) list terminated by a NULL pointer. Intentionally leaves
 * the asm statement open; a per-format epilogue plus
 * YSCALEYUV2PACKEDX_END closes it. */
#define YSCALEYUV2PACKEDX \
    asm volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
    "movq                      %%mm1, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm1         \n\t"\
    "paddw                     %%mm5, %%mm7         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\

/* Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE):
 * operand list and clobbers. Requires c (context), dummy, dest and dstW
 * to be in scope at the expansion site. */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
/* High-precision prologue for packed-output scaling: like YSCALEYUV2PACKEDX
 * but accumulates with pmaddwd in 32 bits. Chroma results are parked in the
 * U_TEMP/V_TEMP scratch slots of the context while the luma loop runs (not
 * enough MMX registers), then reloaded into mm3/mm4 at the end. Leaves the
 * asm statement open for a per-format epilogue + YSCALEYUV2PACKEDX_END. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\

/* YUV -> RGB conversion core for the packed-X path. Expects mm1=Y1, mm7=Y2,
 * mm3=U, mm4=V (as left by the prologue macros); applies the per-context
 * offset/coefficient tables at (%0) and leaves packed bytes:
 * mm2=B, mm4=G, mm5=R (low/high halves interleaved), mm7=0. */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\
    "pxor            %%mm7, %%mm7       \n\t"
/* Disabled legacy full-range YUV->RGB interpolation macro; kept for
 * reference only (uses global MANGLE()d coefficient tables instead of the
 * per-context tables used by the live code above). */
#if 0
#define FULL_YSCALEYUV2RGB \
    "pxor                 %%mm7, %%mm7  \n\t"\
    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "xor              %%"REG_a", %%"REG_a"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
\
\
    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
    "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
\
\
    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
    "packuswb             %%mm3, %%mm3  \n\t"\
\
    "packuswb             %%mm0, %%mm0  \n\t"\
    "paddw                %%mm4, %%mm2  \n\t"\
    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
\
    "packuswb             %%mm1, %%mm1  \n\t"
#endif
/* Two-row linear interpolation producing packed YUV (no RGB conversion):
 * blends buf0/buf1 (luma) and uvbuf0/uvbuf1 (chroma) with the per-context
 * filter weights, leaving Y1/Y2 in mm1/mm7 and U/V in mm3/mm4. The weights
 * are pre-shifted right by 3 in place (and stay modified in the context).
 * The REAL_/plain pair exists so macro arguments expand before pasting. */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
/* Two-row interpolating YUV -> RGB core: blends buf0/buf1 and uvbuf0/uvbuf1
 * with the per-context weights, then applies the offset/coefficient tables
 * at ("#c"). Leaves packed bytes mm2=B, mm4=G, mm5=R (low/high halves),
 * mm7=0, ready for a per-format store epilogue. */
#define REAL_YSCALEYUV2RGB(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
/* Single-row (no interpolation) packed-YUV load: reads buf0/uvbuf0 only and
 * shifts samples right by 7, leaving Y1/Y2 in mm1/mm7 and U/V in mm3/mm4. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
// YUV->RGB conversion of a single (non-interpolated) line: chroma from
// uvbuf0 only, luma from buf0, biased by the U/V/Y offsets and multiplied
// by the colorspace coefficients stored in the table at (c).
// Ends with packed 8-bit results in mm2=B, mm4=G, mm5=R and mm7 cleared.
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
610

    
611
// Packed-YUV load with vertical chroma interpolation: chroma is the
// average of uvbuf0 and uvbuf1 ((a+b)>>8), luma is taken from buf0 only
// (>>7).  Leaves U in mm3, V in mm4, luma halves in mm1/mm7.
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
628

    
629
// do vertical chrominance interpolation: same conversion as
// REAL_YSCALEYUV2RGB1, but the chroma is the average of uvbuf0 and uvbuf1
// ((a+b)>>5 scaling); luma is still taken from buf0 only.
// Ends with packed 8-bit results in mm2=B, mm4=G, mm5=R and mm7 cleared.
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
682

    
683
// Interleave the packed B (mm2), G (mm4), R (mm5) bytes into 32-bit 0RGB
// pixels and stream 8 pixels (32 bytes) per iteration to dst, looping back
// to label "1" until index reaches dstw.
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0,   (dst, index, 4))\
    MOVNTQ(%%mm2,  8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
707

    
708
// Pack the B/G/R bytes down to RGB565 (5-6-5) words: mask off the low bits
// (bF8/bFC masks), shift fields into position, and stream 8 pixels
// (16 bytes) per iteration to dst.
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
735

    
736
// Pack the B/G/R bytes down to RGB555 (5-5-5) words: 5 bits per component
// (bF8 mask on all three, extra >>1 on R), then stream 8 pixels (16 bytes)
// per iteration to dst.
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
764

    
765
// Pack 8 pixels of 24-bit RGB (3 bytes each) from the 0RGB dwords and store
// 24 bytes per iteration.  Original shift/mask implementation; not selected
// by the WRITEBGR24 dispatch below (kept for reference).
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"

// Plain-MMX 24-bit packer: builds the three 8-byte output quadwords with
// shifts, punpckhdq and ors only (no pshufw).
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"

// MMX2 24-bit packer: uses pshufw plus the ff_M24A/B/C byte masks to build
// each output quadword directly.
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"

// Select the 24-bit packer matching the target CPU capabilities.
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
929

    
930
// Pack U (mm3), V (mm4) and the two luma halves (mm1/mm7) into YUYV order
// and stream 8 output pixels (16 bytes) per iteration to dst.
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
946

    
947

    
948
/**
 * Multi-tap vertical scale/filter to planar YV12 output.  Uses the MMX
 * macros unless bit-exact output (SWS_BITEXACT) is requested; otherwise
 * falls back to AltiVec or the generic C implementation.
 * uDest may be NULL, in which case only the luma plane is written.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}
982

    
983
/**
 * Vertical scale/filter to an NV12-family (interleaved-chroma) destination.
 * No SIMD fast path exists for this layout here: the call is delegated
 * unconditionally to the generic C implementation (the SwsContext argument
 * is unused).
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
991

    
992
/**
 * Unfiltered (single source line) write to planar YV12: converts the 7-bit
 * fixed-point samples back to 8-bit with (x+64)>>7 rounding and clipping.
 * uDest may be NULL, in which case only the luma plane is written.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
    int i;
#ifdef HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= uDest ? 3 : 1;  // planes to process: luma only, or luma+U+V
        // NOTE(review): src/dst are passed as end-of-buffer pointers with a
        // negative count ("g"(-counter[p])) -- the asm presumably counts the
        // index up towards zero.  Also int16_t* initializes uint8_t* here
        // (implicit pointer-type mismatch kept as-is) -- verify intent.
        uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[3]= {dest, uDest, vDest};
        long counter[3] = {dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                asm volatile(
                    YSCALEYUV2YV121_ACCURATE
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                    "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }else{
            while(p--){
                asm volatile(
                    YSCALEYUV2YV121
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                    "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;  // round, drop the 7 fractional bits

        // int16_t input means val is in [-256,256]; bit 8 is set exactly
        // when val is outside [0,255], so clip only then.
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
}
1054

    
1055

    
1056
/**
 * vertical scale YV12 to RGB (multi-tap vertical filter, packed output).
 * The MMX fast paths cover RGB32/BGR24/RGB555/RGB565/YUYV422 unless
 * bit-exact output is requested; any other destination format falls through
 * to the AltiVec or generic C implementation at the bottom.
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
{
#ifdef HAVE_MMX
    long dummy=0;
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            switch(c->dstFormat){
            case PIX_FMT_RGB32:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                WRITEBGR32(%4, %5, %%REGa)

                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
                "add %4, %%"REG_c"                        \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                /* BGR24 spells out the asm operand/clobber tail by hand
                   instead of using YSCALEYUV2PACKEDX_END -- presumably
                   because REG_c is clobbered in addition; verify against
                   the _END macro. */
                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest), "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX_ACCURATE
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX_ACCURATE
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3    \n\t"
                "psraw $3, %%mm4    \n\t"
                "psraw $3, %%mm1    \n\t"
                "psraw $3, %%mm7    \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }else{
            switch(c->dstFormat)
            {
            case PIX_FMT_RGB32:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                WRITEBGR32(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_BGR24:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
                "add                        %4, %%"REG_c"   \n\t"
                WRITEBGR24(%%REGc, %5, %%REGa)

                :: "r" (&c->redDither),
                "m" (dummy), "m" (dummy), "m" (dummy),
                "r" (dest),  "m" (dstW)
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
                );
                return;
            case PIX_FMT_RGB555:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4  \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
#endif

                WRITERGB15(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_RGB565:
                YSCALEYUV2PACKEDX
                YSCALEYUV2RGBX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4  \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
#endif

                WRITERGB16(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            case PIX_FMT_YUYV422:
                YSCALEYUV2PACKEDX
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

                "psraw $3, %%mm3    \n\t"
                "psraw $3, %%mm4    \n\t"
                "psraw $3, %%mm1    \n\t"
                "psraw $3, %%mm7    \n\t"
                WRITEYUY2(%4, %5, %%REGa)
                YSCALEYUV2PACKEDX_END
                return;
            }
        }
    }
#endif /* HAVE_MMX */
#ifdef HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of altivec_yuv2packedX() */
    if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                                 chrFilter, chrSrc, chrFilterSize,
                                 dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       dest, dstW, dstY);
}
1205

    
1206
/**
1207
 * vertical bilinear scale YV12 to RGB
1208
 */
1209
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1210
                          uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1211
{
1212
    int  yalpha1=4095- yalpha;
1213
    int uvalpha1=4095-uvalpha;
1214
    int i;
1215

    
1216
#if 0 //isn't used
1217
    if (flags&SWS_FULL_CHR_H_INT)
1218
    {
1219
        switch(dstFormat)
1220
        {
1221
#ifdef HAVE_MMX
1222
        case PIX_FMT_RGB32:
1223
            asm volatile(
1224

1225

1226
FULL_YSCALEYUV2RGB
1227
            "punpcklbw %%mm1, %%mm3    \n\t" // BGBGBGBG
1228
            "punpcklbw %%mm7, %%mm0    \n\t" // R0R0R0R0
1229

1230
            "movq      %%mm3, %%mm1    \n\t"
1231
            "punpcklwd %%mm0, %%mm3    \n\t" // BGR0BGR0
1232
            "punpckhwd %%mm0, %%mm1    \n\t" // BGR0BGR0
1233

1234
            MOVNTQ(%%mm3,  (%4, %%REGa, 4))
1235
            MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1236

1237
            "add $4, %%"REG_a"  \n\t"
1238
            "cmp %5, %%"REG_a"  \n\t"
1239
            " jb 1b             \n\t"
1240

1241
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1242
            "m" (yalpha1), "m" (uvalpha1)
1243
            : "%"REG_a
1244
            );
1245
            break;
1246
        case PIX_FMT_BGR24:
1247
            asm volatile(
1248

1249
FULL_YSCALEYUV2RGB
1250

1251
                                              // lsb ... msb
1252
            "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
1253
            "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0
1254

1255
            "movq      %%mm3, %%mm1     \n\t"
1256
            "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
1257
            "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0
1258

1259
            "movq      %%mm3, %%mm2     \n\t" // BGR0BGR0
1260
            "psrlq        $8, %%mm3     \n\t" // GR0BGR00
1261
            "pand "MANGLE(bm00000111)", %%mm2   \n\t" // BGR00000
1262
            "pand "MANGLE(bm11111000)", %%mm3   \n\t" // 000BGR00
1263
            "por       %%mm2, %%mm3     \n\t" // BGRBGR00
1264
            "movq      %%mm1, %%mm2     \n\t"
1265
            "psllq       $48, %%mm1     \n\t" // 000000BG
1266
            "por       %%mm1, %%mm3     \n\t" // BGRBGRBG
1267

1268
            "movq      %%mm2, %%mm1     \n\t" // BGR0BGR0
1269
            "psrld       $16, %%mm2     \n\t" // R000R000
1270
            "psrlq       $24, %%mm1     \n\t" // 0BGR0000
1271
            "por       %%mm2, %%mm1     \n\t" // RBGRR000
1272

1273
            "mov          %4, %%"REG_b" \n\t"
1274
            "add   %%"REG_a", %%"REG_b" \n\t"
1275

1276
#ifdef HAVE_MMX2
1277
            //FIXME Alignment
1278
            "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2)   \n\t"
1279
            "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)   \n\t"
1280
#else
1281
            "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)     \n\t"
1282
            "psrlq  $32, %%mm3                          \n\t"
1283
            "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)     \n\t"
1284
            "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)     \n\t"
1285
#endif
1286
            "add     $4, %%"REG_a"                      \n\t"
1287
            "cmp     %5, %%"REG_a"                      \n\t"
1288
            " jb     1b                                 \n\t"
1289

    
1290
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1291
            "m" (yalpha1), "m" (uvalpha1)
1292
            : "%"REG_a, "%"REG_b
1293
            );
1294
            break;
1295
        case PIX_FMT_BGR555:
1296
            asm volatile(
1297

    
1298
FULL_YSCALEYUV2RGB
1299
#ifdef DITHER1XBPP
1300
            "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1301
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1302
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1303
#endif
1304
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1305
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1306
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1307

    
1308
            "psrlw                   $3, %%mm3  \n\t"
1309
            "psllw                   $2, %%mm1  \n\t"
1310
            "psllw                   $7, %%mm0  \n\t"
1311
            "pand     "MANGLE(g15Mask)", %%mm1  \n\t"
1312
            "pand     "MANGLE(r15Mask)", %%mm0  \n\t"
1313

    
1314
            "por                  %%mm3, %%mm1  \n\t"
1315
            "por                  %%mm1, %%mm0  \n\t"
1316

    
1317
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1318

    
1319
            "add $4, %%"REG_a"  \n\t"
1320
            "cmp %5, %%"REG_a"  \n\t"
1321
            " jb 1b             \n\t"
1322

    
1323
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1324
            "m" (yalpha1), "m" (uvalpha1)
1325
            : "%"REG_a
1326
            );
1327
            break;
1328
        case PIX_FMT_BGR565:
1329
            asm volatile(
1330

    
1331
FULL_YSCALEYUV2RGB
1332
#ifdef DITHER1XBPP
1333
            "paddusb "MANGLE(g6Dither)", %%mm1  \n\t"
1334
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1335
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1336
#endif
1337
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1338
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1339
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1340

    
1341
            "psrlw                   $3, %%mm3  \n\t"
1342
            "psllw                   $3, %%mm1  \n\t"
1343
            "psllw                   $8, %%mm0  \n\t"
1344
            "pand     "MANGLE(g16Mask)", %%mm1  \n\t"
1345
            "pand     "MANGLE(r16Mask)", %%mm0  \n\t"
1346

    
1347
            "por                  %%mm3, %%mm1  \n\t"
1348
            "por                  %%mm1, %%mm0  \n\t"
1349

    
1350
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1351

    
1352
            "add $4, %%"REG_a"  \n\t"
1353
            "cmp %5, %%"REG_a"  \n\t"
1354
            " jb 1b             \n\t"
1355

    
1356
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1357
            "m" (yalpha1), "m" (uvalpha1)
1358
            : "%"REG_a
1359
            );
1360
            break;
1361
#endif /* HAVE_MMX */
1362
        case PIX_FMT_BGR32:
1363
#ifndef HAVE_MMX
1364
        case PIX_FMT_RGB32:
1365
#endif
1366
            if (dstFormat==PIX_FMT_RGB32)
1367
            {
1368
                int i;
1369
#ifdef WORDS_BIGENDIAN
1370
                dest++;
1371
#endif
1372
                for (i=0;i<dstW;i++){
1373
                    // vertical linear interpolation && yuv2rgb in a single step:
1374
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1375
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1376
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1377
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1378
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1379
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1380
                    dest+= 4;
1381
                }
1382
            }
1383
            else if (dstFormat==PIX_FMT_BGR24)
1384
            {
1385
                int i;
1386
                for (i=0;i<dstW;i++){
1387
                    // vertical linear interpolation && yuv2rgb in a single step:
1388
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1389
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1390
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1391
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1392
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1393
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1394
                    dest+= 3;
1395
                }
1396
            }
1397
            else if (dstFormat==PIX_FMT_BGR565)
1398
            {
1399
                int i;
1400
                for (i=0;i<dstW;i++){
1401
                    // vertical linear interpolation && yuv2rgb in a single step:
1402
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1403
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1404
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1405

    
1406
                    ((uint16_t*)dest)[i] =
1407
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1408
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1409
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1410
                }
1411
            }
1412
            else if (dstFormat==PIX_FMT_BGR555)
1413
            {
1414
                int i;
1415
                for (i=0;i<dstW;i++){
1416
                    // vertical linear interpolation && yuv2rgb in a single step:
1417
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1418
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1419
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1420

    
1421
                    ((uint16_t*)dest)[i] =
1422
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1423
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1424
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1425
                }
1426
            }
1427
        }//FULL_UV_IPOL
1428
    else
1429
    {
1430
#endif // if 0
1431
#ifdef HAVE_MMX
1432
    if(!(c->flags & SWS_BITEXACT)){
1433
        switch(c->dstFormat)
1434
        {
1435
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1436
            case PIX_FMT_RGB32:
1437
                asm volatile(
1438
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1439
                "mov        %4, %%"REG_b"               \n\t"
1440
                "push %%"REG_BP"                        \n\t"
1441
                YSCALEYUV2RGB(%%REGBP, %5)
1442
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1443
                "pop %%"REG_BP"                         \n\t"
1444
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1445

    
1446
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1447
                "a" (&c->redDither)
1448
                );
1449
                return;
1450
            case PIX_FMT_BGR24:
1451
                asm volatile(
1452
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1453
                "mov        %4, %%"REG_b"               \n\t"
1454
                "push %%"REG_BP"                        \n\t"
1455
                YSCALEYUV2RGB(%%REGBP, %5)
1456
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1457
                "pop %%"REG_BP"                         \n\t"
1458
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1459
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1460
                "a" (&c->redDither)
1461
                );
1462
                return;
1463
            case PIX_FMT_RGB555:
1464
                asm volatile(
1465
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1466
                "mov        %4, %%"REG_b"               \n\t"
1467
                "push %%"REG_BP"                        \n\t"
1468
                YSCALEYUV2RGB(%%REGBP, %5)
1469
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1470
#ifdef DITHER1XBPP
1471
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1472
                "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1473
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1474
#endif
1475

    
1476
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1477
                "pop %%"REG_BP"                         \n\t"
1478
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1479

    
1480
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1481
                "a" (&c->redDither)
1482
                );
1483
                return;
1484
            case PIX_FMT_RGB565:
1485
                asm volatile(
1486
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1487
                "mov        %4, %%"REG_b"               \n\t"
1488
                "push %%"REG_BP"                        \n\t"
1489
                YSCALEYUV2RGB(%%REGBP, %5)
1490
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1491
#ifdef DITHER1XBPP
1492
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1493
                "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1494
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1495
#endif
1496

    
1497
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1498
                "pop %%"REG_BP"                         \n\t"
1499
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1500
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1501
                "a" (&c->redDither)
1502
                );
1503
                return;
1504
            case PIX_FMT_YUYV422:
1505
                asm volatile(
1506
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1507
                "mov %4, %%"REG_b"                        \n\t"
1508
                "push %%"REG_BP"                        \n\t"
1509
                YSCALEYUV2PACKED(%%REGBP, %5)
1510
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1511
                "pop %%"REG_BP"                         \n\t"
1512
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1513
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514
                "a" (&c->redDither)
1515
                );
1516
                return;
1517
            default: break;
1518
        }
1519
    }
1520
#endif //HAVE_MMX
1521
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1522
}
1523

    
1524
/**
 * YV12 to RGB without scaling or interpolating.
 * Converts one output scanline from a single luma line (buf0) and either one
 * chroma line or a blend of two (uvbuf0/uvbuf1), i.e. the vertical luma
 * interpolation of the 2-line path is bypassed (yalpha is fixed).
 *
 * @param c         swscale context (provides dither state via &c->redDither)
 * @param buf0      luma input line (intermediate-precision samples)
 * @param uvbuf0    first chroma line; U samples, V at offset VOFW
 * @param uvbuf1    second chroma line, blended in when uvalpha >= 2048
 * @param dest      destination scanline in the packed dstFormat
 * @param dstW      number of output pixels
 * @param uvalpha   chroma blend factor between uvbuf0 and uvbuf1 (0..4096)
 * @param dstFormat PIX_FMT_* of the destination
 * @param flags     SWS_* flags (SWS_FULL_CHR_H_INT, SWS_BITEXACT checked here)
 * @param y         output line index (used by the C fallback macros)
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                          uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    // Full horizontal chroma interpolation is delegated to the 2-line path
    // with both luma inputs set to buf0 (no vertical luma blend).
    if (flags&SWS_FULL_CHR_H_INT)
    {
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
        return;
    }

#ifdef HAVE_MMX
    if(!(flags & SWS_BITEXACT)){
        // Each asm block below saves %REG_b into the context (NOTE(review):
        // presumably because it can be reserved, e.g. as the PIC register)
        // and pushes %REG_BP, both restored before the block ends.
        if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        {
            switch(dstFormat)
            {
            case PIX_FMT_RGB32:
                asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_BGR24:
                asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                /* 565 uses the 6-bit green dither table, 5-bit for R/B. */
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2PACKED1(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            }
        }
        else
        {
            // uvalpha >= 2048: use the *1b variants, which average the two
            // chroma lines instead of taking uvbuf0 alone.
            switch(dstFormat)
            {
            case PIX_FMT_RGB32:
                asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_BGR24:
                asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
                "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
                "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                asm volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2PACKED1b(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* HAVE_MMX */
    // Portable C fallback; also used with SWS_BITEXACT or unsupported formats.
    if (uvalpha < 2048)
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
1731

    
1732
//FIXME yuy2* can read up to 7 samples too much
1733

    
1734
/**
 * Extract the luma (Y) plane from a packed YUYV line.
 * Y occupies the even bytes of the Y0 U Y1 V layout, so the MMX path masks
 * every other byte (bm01010101) and packs 16 input bytes to 8 Y bytes per
 * iteration. The loop counter runs from -width up to 0 so the sign flag can
 * be used as the loop condition.
 *
 * @param dst    output Y samples, width bytes
 * @param src    packed YUYV input, 2*width bytes
 * @param width  number of luma samples to produce
 * @param unused ignored (common prototype shared with palette readers)
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#ifdef HAVE_MMX
    asm volatile(
    "movq "MANGLE(bm01010101)", %%mm2           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
    "pand                %%mm2, %%mm0           \n\t"
    "pand                %%mm2, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
    "add                    $8, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1758

    
1759
/**
 * Extract the chroma (U and V) samples from a packed YUYV line into separate
 * planes. In Y0 U Y1 V layout the odd bytes alternate U,V: the MMX path
 * shifts out the luma bytes, then splits the remaining UVUV stream into U
 * (even) and V (odd) with a mask/shift pair before packing down to bytes.
 *
 * @param dstU   output U samples, width bytes
 * @param dstV   output V samples, width bytes
 * @param src1   packed YUYV input, 4*width bytes
 * @param src2   must equal src1 (asserted below; shared reader prototype)
 * @param width  number of chroma samples per plane
 * @param unused ignored
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#ifdef HAVE_MMX
    asm volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "psrlw                  $8, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1793

    
1794
/* This is almost identical to the previous, end exists only because
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Extract the luma (Y) plane from a packed UYVY line.
 * Y occupies the odd bytes of the U Y0 V Y1 layout, so the MMX path shifts
 * each 16-bit lane right by 8 instead of masking (compare yuy2ToY).
 *
 * @param dst    output Y samples, width bytes
 * @param src    packed UYVY input, 2*width bytes
 * @param width  number of luma samples to produce
 * @param unused ignored
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#ifdef HAVE_MMX
    asm volatile(
    "mov                  %0, %%"REG_a"         \n\t"
    "1:                                         \n\t"
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
    "psrlw                $8, %%mm0             \n\t"
    "psrlw                $8, %%mm1             \n\t"
    "packuswb          %%mm1, %%mm0             \n\t"
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
    "add                  $8, %%"REG_a"         \n\t"
    " js                  1b                    \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1819

    
1820
/**
 * Extract the chroma (U and V) samples from a packed UYVY line into separate
 * planes. In U Y0 V Y1 layout the even bytes alternate U,V: the MMX path
 * masks out the luma bytes (instead of shifting, compare yuy2ToUV), then
 * splits the UVUV stream into U and V planes.
 *
 * @param dstU   output U samples, width bytes
 * @param dstV   output V samples, width bytes
 * @param src1   packed UYVY input, 4*width bytes
 * @param src2   must equal src1 (asserted below; shared reader prototype)
 * @param width  number of chroma samples per plane
 * @param unused ignored
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#ifdef HAVE_MMX
    asm volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "pand                %%mm4, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
1854

    
1855
/*
 * BGR2Y: generates a RENAME(name) function converting one line of packed
 * RGB/BGR pixels to luma (Y).
 *
 * Parameters of the generated function:
 *   dst    - output Y samples, width bytes
 *   src    - packed pixels, read as an array of `type` (uint16_t/uint32_t)
 *   width  - number of pixels
 *   unused - ignored (common reader prototype)
 *
 * Macro parameters:
 *   shr/shg/shb and maskr/maskg/maskb isolate the R/G/B channel bits of one
 *   pixel word; RY/GY/BY are the (pre-shifted) luma coefficients and S the
 *   final right shift. The `33<<((S)-1)` term combines rounding with the
 *   Y range offset (NOTE(review): value inherited from the original tables
 *   — confirm against RGB2YUV_SHIFT before changing).
 *
 * No comments inside the macro body: `//` would swallow the `\` continuations.
 */
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]>>shb)&maskb;\
        int g= (((type*)src)[i]>>shg)&maskg;\
        int r= (((type*)src)[i]>>shr)&maskr;\
\
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
    }\
}
1868

    
1869
/* Instantiate the luma readers for the packed 32/16/15-bit RGB and BGR
 * formats. Shift/mask pairs select each channel; the coefficient shifts
 * (RY<<8 etc.) and the S argument keep the total fixed-point scale equal
 * to RGB2YUV_SHIFT+8 (or +7 for the 15-bit formats with 5-bit channels). */
BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1875

    
1876
/*
 * BGR2UV: generates two RENAME()d functions converting packed RGB/BGR pixels
 * to chroma (U and V):
 *
 *   RENAME(name)        - one U/V sample per input pixel.
 *   RENAME(name##_half) - one U/V sample per *pair* of input pixels; the two
 *                         pixels are summed per channel before the dot
 *                         product, and the final shift is (S)+1 to divide
 *                         the doubled sum back down.
 *
 * In the _half variant, `(maskb|(2*maskb))` widens the channel mask by one
 * bit so the carry of the two-pixel sum is kept; green is isolated first and
 * subtracted so the b and r sums do not bleed into each other.
 *
 * The `257<<((S)-1)` / `257<<(S)` terms combine rounding with the +128
 * chroma bias (NOTE(review): 257 = 2*128+1, i.e. bias plus round — confirm
 * against the fixed-point scale before changing).
 *
 * No comments inside the macro bodies: `//` would swallow the `\`
 * continuations.
 */
#define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]&maskb)>>shb;\
        int g= (((type*)src)[i]&maskg)>>shg;\
        int r= (((type*)src)[i]&maskr)>>shr;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
    }\
}\
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int pix0= ((type*)src)[2*i+0];\
        int pix1= ((type*)src)[2*i+1];\
        int g= (pix0&maskg)+(pix1&maskg);\
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
\
        g>>=shg;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
    }\
}
1907

    
1908
/* Instantiate the chroma readers (full-resolution and _half variants) for
 * the packed 32/16/15-bit RGB and BGR formats, mirroring the BGR2Y set. */
BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1914

    
1915
#ifdef HAVE_MMX
1916
/**
 * MMX conversion of one line of packed 24-bit BGR (or RGB) to luma (Y).
 *
 * First loads the channel-order-specific coefficient pairs into mm5/mm6
 * (BGR vs RGB tables), then processes 4 pixels (12 bytes) per iteration:
 * four overlapping dword loads unpack the 24-bit pixels into word lanes,
 * pmaddwd applies the luma coefficients, the two partial dot products are
 * summed, the rounding/range offset (ff_bgr24toYOffset) is added, and the
 * results are shifted down by 15 and packed to 4 output bytes.
 *
 * The loop counter runs from -width up to 0 (sign flag terminates the loop);
 * src is advanced in-place via the "+r" constraint.
 *
 * @param dst       output Y samples, width bytes
 * @param src       packed 24-bit input, 3*width bytes
 * @param width     number of pixels (NOTE(review): the 4-pixels-per-iteration
 *                  loop presumably requires width to be a multiple of 4 —
 *                  confirm with callers)
 * @param srcFormat PIX_FMT_BGR24 or PIX_FMT_RGB24 (selects coefficient order)
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24){
        asm volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }else{
        asm volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    asm volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t"
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t"
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" (-width)
    : "%"REG_a
    );
}
1968

    
1969
/**
 * MMX conversion of packed 24-bit BGR/RGB to U and V planes, 4 pixels
 * per loop iteration.
 *
 * @param dstU      output U plane (one byte per pixel)
 * @param dstV      output V plane (one byte per pixel)
 * @param src       packed 24-bit source (3 bytes per pixel)
 * @param width     number of pixels to convert
 * @param srcFormat PIX_FMT_BGR24 or PIX_FMT_RGB24; selects the matching
 *                  coefficient row of ff_bgr24toUV (operand %4 below)
 *
 * The loop counter in REG_a starts at -width and counts up to 0, so the
 * destination operands are passed as dst+width and indexed negatively.
 */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
{
    asm volatile(
        /* mm6 = fourth coefficient quad of the selected UV matrix;
           mm7 = 0 for byte->word unpacking */
        "movq                    24+%4, %%mm6       \n\t"
        "mov                        %3, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        /* first pixel pair: load 2x4 bytes at offsets 0 and 2 */
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        /* multiply-accumulate against the four coefficient quads of %4:
           mm0/mm1 accumulate U, mm2/mm3 accumulate V */
        "pmaddwd                    %4, %%mm0       \n\t"
        "pmaddwd                  8+%4, %%mm1       \n\t"
        "pmaddwd                 16+%4, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        /* second pixel pair: offsets 6 and 8, then advance src by 12
           bytes (= 4 pixels of 3 bytes) */
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                    %4, %%mm1       \n\t"
        "pmaddwd                  8+%4, %%mm3       \n\t"
        "pmaddwd                 16+%4, %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        /* add rounding/bias offset, scale down by 2^15 and pack the four
           U results into mm0 and the four V results into mm2 */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    : "%"REG_a
    );
}
2026
#endif
2027

    
2028
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2029
{
2030
#ifdef HAVE_MMX
2031
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
2032
#else
2033
    int i;
2034
    for (i=0; i<width; i++)
2035
    {
2036
        int b= src[i*3+0];
2037
        int g= src[i*3+1];
2038
        int r= src[i*3+2];
2039

    
2040
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2041
    }
2042
#endif /* HAVE_MMX */
2043
}
2044

    
2045
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2046
{
2047
#ifdef HAVE_MMX
2048
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
2049
#else
2050
    int i;
2051
    for (i=0; i<width; i++)
2052
    {
2053
        int b= src1[3*i + 0];
2054
        int g= src1[3*i + 1];
2055
        int r= src1[3*i + 2];
2056

    
2057
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2058
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2059
    }
2060
#endif /* HAVE_MMX */
2061
    assert(src1 == src2);
2062
}
2063

    
2064
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2065
{
2066
    int i;
2067
    for (i=0; i<width; i++)
2068
    {
2069
        int b= src1[6*i + 0] + src1[6*i + 3];
2070
        int g= src1[6*i + 1] + src1[6*i + 4];
2071
        int r= src1[6*i + 2] + src1[6*i + 5];
2072

    
2073
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2074
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2075
    }
2076
    assert(src1 == src2);
2077
}
2078

    
2079
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2080
{
2081
#ifdef HAVE_MMX
2082
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2083
#else
2084
    int i;
2085
    for (i=0; i<width; i++)
2086
    {
2087
        int r= src[i*3+0];
2088
        int g= src[i*3+1];
2089
        int b= src[i*3+2];
2090

    
2091
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2092
    }
2093
#endif
2094
}
2095

    
2096
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2097
{
2098
    int i;
2099
    assert(src1==src2);
2100
#ifdef HAVE_MMX
2101
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2102
#else
2103
    for (i=0; i<width; i++)
2104
    {
2105
        int r= src1[3*i + 0];
2106
        int g= src1[3*i + 1];
2107
        int b= src1[3*i + 2];
2108

    
2109
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2110
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2111
    }
2112
#endif
2113
}
2114

    
2115
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
2116
{
2117
    int i;
2118
    assert(src1==src2);
2119
    for (i=0; i<width; i++)
2120
    {
2121
        int r= src1[6*i + 0] + src1[6*i + 3];
2122
        int g= src1[6*i + 1] + src1[6*i + 4];
2123
        int b= src1[6*i + 2] + src1[6*i + 5];
2124

    
2125
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2126
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2127
    }
2128
}
2129

    
2130

    
2131
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
2132
{
2133
    int i;
2134
    for (i=0; i<width; i++)
2135
    {
2136
        int d= src[i];
2137

    
2138
        dst[i]= pal[d] & 0xFF;
2139
    }
2140
}
2141

    
2142
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
2143
{
2144
    int i;
2145
    assert(src1 == src2);
2146
    for (i=0; i<width; i++)
2147
    {
2148
        int p= pal[src1[i]];
2149

    
2150
        dstU[i]= p>>8;
2151
        dstV[i]= p>>16;
2152
    }
2153
}
2154

    
2155
static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2156
{
2157
    int i, j;
2158
    for (i=0; i<width/8; i++){
2159
        int d= ~src[i];
2160
        for(j=0; j<8; j++)
2161
            dst[8*i+j]= ((d>>(7-j))&1)*255;
2162
    }
2163
}
2164

    
2165
static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
2166
{
2167
    int i, j;
2168
    for (i=0; i<width/8; i++){
2169
        int d= src[i];
2170
        for(j=0; j<8; j++)
2171
            dst[8*i+j]= ((d>>(7-j))&1)*255;
2172
    }
2173
}
2174

    
2175
// bilinear / bicubic scaling
2176
/**
 * Apply a horizontal FIR filter to one line (bilinear / bicubic scaling).
 *
 * @param dst        output, one int16 per destination pixel
 * @param dstW       number of destination pixels
 * @param src        8-bit source line
 * @param filter     filterSize coefficients per destination pixel
 * @param filterPos  source start position for each destination pixel
 * @param filterSize number of taps; the MMX path requires a multiple of 4
 *
 * Results are >>7 and clipped to int16 range in the scalar path (the
 * cubic filter can overflow, see comment below). The MMX variants use a
 * negative up-counting index in REG_BP and pre-bias filter/filterPos/dst
 * so the loop can terminate on the carry flag (jnc).
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#ifdef HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        long counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push            %%"REG_b"              \n\t"
#endif
        "pxor                %%mm7, %%mm7       \n\t"
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        /* two destination pixels per iteration: fetch their source
           positions, 4 taps each, multiply-accumulate and store */
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
        "movd      (%3, %%"REG_a"), %%mm0       \n\t"
        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
        "punpcklbw           %%mm7, %%mm0       \n\t"
        "punpcklbw           %%mm7, %%mm2       \n\t"
        "pmaddwd             %%mm1, %%mm0       \n\t"
        "pmaddwd             %%mm2, %%mm3       \n\t"
        "movq                %%mm0, %%mm4       \n\t"
        "punpckldq           %%mm3, %%mm0       \n\t"
        "punpckhdq           %%mm3, %%mm4       \n\t"
        "paddd               %%mm4, %%mm0       \n\t"
        "psrad                  $7, %%mm0       \n\t"
        "packssdw            %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
        "add                    $4, %%"REG_BP"  \n\t"
        " jnc                   1b              \n\t"

        "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
        "pop             %%"REG_b"              \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        long counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push             %%"REG_b"             \n\t"
#endif
        "pxor                 %%mm7, %%mm7      \n\t"
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
        "mov              %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        /* same scheme as the 4-tap loop, but each destination pixel
           accumulates two quads of taps (0-3 here, 4-7 below) */
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm0      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm0      \n\t"
        "pmaddwd              %%mm2, %%mm3      \n\t"

        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm4      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm4      \n\t"
        "pmaddwd              %%mm2, %%mm5      \n\t"
        "paddd                %%mm4, %%mm0      \n\t"
        "paddd                %%mm5, %%mm3      \n\t"
        "movq                 %%mm0, %%mm4      \n\t"
        "punpckldq            %%mm3, %%mm0      \n\t"
        "punpckhdq            %%mm3, %%mm4      \n\t"
        "paddd                %%mm4, %%mm0      \n\t"
        "psrad                   $7, %%mm0      \n\t"
        "packssdw             %%mm0, %%mm0      \n\t"
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
        "add                     $4, %%"REG_BP" \n\t"
        " jnc                    1b             \n\t"

        "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
        "pop              %%"REG_b"             \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else
    {
        /* generic tap count (any multiple of 4): inner loop 2 walks the
           taps 4 at a time until REG_c reaches src+filterSize (%4) */
        uint8_t *offset = src+filterSize;
        long counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
        "pxor                  %%mm7, %%mm7     \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "mov                      %2, %%"REG_c" \n\t"
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
        "mov                      %5, %%"REG_c" \n\t"
        "pxor                  %%mm4, %%mm4     \n\t"
        "pxor                  %%mm5, %%mm5     \n\t"
        "2:                                     \n\t"
        "movq                   (%1), %%mm1     \n\t"
        "movq               (%1, %6), %%mm3     \n\t"
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
        "punpcklbw             %%mm7, %%mm0     \n\t"
        "punpcklbw             %%mm7, %%mm2     \n\t"
        "pmaddwd               %%mm1, %%mm0     \n\t"
        "pmaddwd               %%mm2, %%mm3     \n\t"
        "paddd                 %%mm3, %%mm5     \n\t"
        "paddd                 %%mm0, %%mm4     \n\t"
        "add                      $8, %1        \n\t"
        "add                      $4, %%"REG_c" \n\t"
        "cmp                      %4, %%"REG_c" \n\t"
        " jb                      2b            \n\t"
        "add                      %6, %1        \n\t"
        "movq                  %%mm4, %%mm0     \n\t"
        "punpckldq             %%mm5, %%mm4     \n\t"
        "punpckhdq             %%mm5, %%mm0     \n\t"
        "paddd                 %%mm0, %%mm4     \n\t"
        "psrad                    $7, %%mm4     \n\t"
        "packssdw              %%mm4, %%mm4     \n\t"
        "mov                      %3, %%"REG_a" \n\t"
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
        "add                      $4, %0        \n\t"
        " jnc                     1b            \n\t"

        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" (filterSize*2)
        : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#ifdef HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* portable reference implementation */
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* HAVE_ALTIVEC */
#endif /* HAVE_MMX */
}
2357
      // *** horizontal scale Y line to temp buffer
2358
/**
 * Horizontally scale one luma line into dst.
 *
 * Stage 1: if srcFormat is not already 8-bit luma, convert the line into
 *          formatConvBuffer with the matching per-format helper and make
 *          src point at it.
 * Stage 2: scale, choosing between the generic FIR hScale(), the MMX2
 *          "funny code" (pre-generated scaler in funnyYCode, driven by
 *          mmx2Filter/mmx2FilterPos) and a plain x86 asm bilinear loop.
 * Stage 3: if source and destination luma ranges differ (and the output
 *          is not RGB/BGR), remap the scaled values in place.
 */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint32_t *pal)
{
    /* -- stage 1: input format -> 8-bit luma ------------------------- */
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOBLACK)
    {
        RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_MONOWHITE)
    {
        RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }

    /* -- stage 2: horizontal scaling --------------------------------- */
#ifdef HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            asm volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

            /* FUNNY_Y_CODE calls the pre-generated scaler (%4), then
               advances src by the chunk size read from mmx2FilterPos
               and resets the per-chunk index; expanded 8 times below. */
#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyYCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            /* fix up the last pixels the funny code cannot handle */
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif /* HAVE_MMX2 */
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        asm volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* portable C bilinear: 16.16 fixed-point position, 7-bit alpha */
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif /* defined(ARCH_X86) */
    }

    /* -- stage 3: luma range conversion (JPEG <-> MPEG levels) ------- */
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this convertion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++)
                dst[i]= (dst[i]*14071 + 33561947)>>14;
        }else{
            for (i=0; i<dstWidth; i++)
                dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
        }
    }
}
2592

    
2593
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2594
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2595
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2596
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2597
                                   int32_t *mmx2FilterPos, uint32_t *pal)
2598
{
2599
    if (srcFormat==PIX_FMT_YUYV422)
2600
    {
2601
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2602
        src1= formatConvBuffer;
2603
        src2= formatConvBuffer+VOFW;
2604
    }
2605
    else if (srcFormat==PIX_FMT_UYVY422)
2606
    {
2607
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2608
        src1= formatConvBuffer;
2609
        src2= formatConvBuffer+VOFW;
2610
    }
2611
    else if (srcFormat==PIX_FMT_RGB32)
2612
    {
2613
        if(c->chrSrcHSubSample)
2614
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2615
        else
2616
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2617
        src1= formatConvBuffer;
2618
        src2= formatConvBuffer+VOFW;
2619
    }
2620
    else if (srcFormat==PIX_FMT_RGB32_1)
2621
    {
2622
        if(c->chrSrcHSubSample)
2623
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2624
        else
2625
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2626
        src1= formatConvBuffer;
2627
        src2= formatConvBuffer+VOFW;
2628
    }
2629
    else if (srcFormat==PIX_FMT_BGR24)
2630
    {
2631
        if(c->chrSrcHSubSample)
2632
            RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2633
        else
2634
            RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2635
        src1= formatConvBuffer;
2636
        src2= formatConvBuffer+VOFW;
2637
    }
2638
    else if (srcFormat==PIX_FMT_BGR565)
2639
    {
2640
        if(c->chrSrcHSubSample)
2641
            RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2642
        else
2643
            RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2644
        src1= formatConvBuffer;
2645
        src2= formatConvBuffer+VOFW;
2646
    }
2647
    else if (srcFormat==PIX_FMT_BGR555)
2648
    {
2649
        if(c->chrSrcHSubSample)
2650
            RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2651
        else
2652
            RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2653
        src1= formatConvBuffer;
2654
        src2= formatConvBuffer+VOFW;
2655
    }
2656
    else if (srcFormat==PIX_FMT_BGR32)
2657
    {
2658
        if(c->chrSrcHSubSample)
2659
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2660
        else
2661
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2662
        src1= formatConvBuffer;
2663
        src2= formatConvBuffer+VOFW;
2664
    }
2665
    else if (srcFormat==PIX_FMT_BGR32_1)
2666
    {
2667
        if(c->chrSrcHSubSample)
2668
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2669
        else
2670
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2671
        src1= formatConvBuffer;
2672
        src2= formatConvBuffer+VOFW;
2673
    }
2674
    else if (srcFormat==PIX_FMT_RGB24)
2675
    {
2676
        if(c->chrSrcHSubSample)
2677
            RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2678
        else
2679
            RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2680
        src1= formatConvBuffer;
2681
        src2= formatConvBuffer+VOFW;
2682
    }
2683
    else if (srcFormat==PIX_FMT_RGB565)
2684
    {
2685
        if(c->chrSrcHSubSample)
2686
            RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2687
        else
2688
            RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2689
        src1= formatConvBuffer;
2690
        src2= formatConvBuffer+VOFW;
2691
    }
2692
    else if (srcFormat==PIX_FMT_RGB555)
2693
    {
2694
        if(c->chrSrcHSubSample)
2695
            RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2696
        else
2697
            RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2698
        src1= formatConvBuffer;
2699
        src2= formatConvBuffer+VOFW;
2700
    }
2701
    else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2702
    {
2703
        return;
2704
    }
2705
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2706
    {
2707
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2708
        src1= formatConvBuffer;
2709
        src2= formatConvBuffer+VOFW;
2710
    }
2711

    
2712
#ifdef HAVE_MMX
2713
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2714
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2715
#else
2716
    if (!(flags&SWS_FAST_BILINEAR))
2717
#endif
2718
    {
2719
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2720
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2721
    }
2722
    else // fast bilinear upscale / crap downscale
2723
    {
2724
#if defined(ARCH_X86)
2725
#ifdef HAVE_MMX2
2726
        int i;
2727
#if defined(PIC)
2728
        uint64_t ebxsave __attribute__((aligned(8)));
2729
#endif
2730
        if (canMMX2BeUsed)
2731
        {
2732
            asm volatile(
2733
#if defined(PIC)
2734
            "mov          %%"REG_b", %6         \n\t"
2735
#endif
2736
            "pxor             %%mm7, %%mm7      \n\t"
2737
            "mov                 %0, %%"REG_c"  \n\t"
2738
            "mov                 %1, %%"REG_D"  \n\t"
2739
            "mov                 %2, %%"REG_d"  \n\t"
2740
            "mov                 %3, %%"REG_b"  \n\t"
2741
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2742
            PREFETCH"   (%%"REG_c")             \n\t"
2743
            PREFETCH" 32(%%"REG_c")             \n\t"
2744
            PREFETCH" 64(%%"REG_c")             \n\t"
2745

    
2746
#ifdef ARCH_X86_64
2747

    
2748
#define FUNNY_UV_CODE \
2749
            "movl       (%%"REG_b"), %%esi      \n\t"\
2750
            "call               *%4             \n\t"\
2751
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2752
            "add          %%"REG_S", %%"REG_c"  \n\t"\
2753
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2754
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2755

    
2756
#else
2757

    
2758
#define FUNNY_UV_CODE \
2759
            "movl       (%%"REG_b"), %%esi      \n\t"\
2760
            "call               *%4             \n\t"\
2761
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2762
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2763
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2764

    
2765
#endif /* ARCH_X86_64 */
2766

    
2767
FUNNY_UV_CODE
2768
FUNNY_UV_CODE
2769
FUNNY_UV_CODE
2770
FUNNY_UV_CODE
2771
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2772
            "mov                 %5, %%"REG_c"  \n\t" // src
2773
            "mov                 %1, %%"REG_D"  \n\t" // buf1
2774
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
2775
            PREFETCH"   (%%"REG_c")             \n\t"
2776
            PREFETCH" 32(%%"REG_c")             \n\t"
2777
            PREFETCH" 64(%%"REG_c")             \n\t"
2778

    
2779
FUNNY_UV_CODE
2780
FUNNY_UV_CODE
2781
FUNNY_UV_CODE
2782
FUNNY_UV_CODE
2783

    
2784
#if defined(PIC)
2785
            "mov %6, %%"REG_b"    \n\t"
2786
#endif
2787
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2788
            "m" (funnyUVCode), "m" (src2)
2789
#if defined(PIC)
2790
            ,"m" (ebxsave)
2791
#endif
2792
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2793
#if !defined(PIC)
2794
             ,"%"REG_b
2795
#endif
2796
            );
2797
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2798
            {
2799
                //printf("%d %d %d\n", dstWidth, i, srcW);
2800
                dst[i] = src1[srcW-1]*128;
2801
                dst[i+VOFW] = src2[srcW-1]*128;
2802
            }
2803
        }
2804
        else
2805
        {
2806
#endif /* HAVE_MMX2 */
2807
            long xInc_shr16 = (long) (xInc >> 16);
2808
            uint16_t xInc_mask = xInc & 0xffff;
2809
            asm volatile(
2810
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
2811
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2812
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
2813
            ASMALIGN(4)
2814
            "1:                                     \n\t"
2815
            "mov        %0, %%"REG_S"               \n\t"
2816
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2817
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2818
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2819
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2820
            "shll      $16, %%edi                   \n\t"
2821
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2822
            "mov        %1, %%"REG_D"               \n\t"
2823
            "shrl       $9, %%esi                   \n\t"
2824
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2825

    
2826
            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2827
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2828
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2829
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2830
            "shll      $16, %%edi                   \n\t"
2831
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2832
            "mov        %1, %%"REG_D"               \n\t"
2833
            "shrl       $9, %%esi                   \n\t"
2834
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
2835

    
2836
            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
2837
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
2838
            "add        $1, %%"REG_a"               \n\t"
2839
            "cmp        %2, %%"REG_a"               \n\t"
2840
            " jb        1b                          \n\t"
2841

    
2842
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2843
   which is needed to support GCC 4.0. */
2844
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2845
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2846
#else
2847
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2848
#endif
2849
            "r" (src2)
2850
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2851
            );
2852
#ifdef HAVE_MMX2
2853
        } //if MMX2 can't be used
2854
#endif
2855
#else
2856
        int i;
2857
        unsigned int xpos=0;
2858
        for (i=0;i<dstWidth;i++)
2859
        {
2860
            register unsigned int xx=xpos>>16;
2861
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
2862
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2863
            dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2864
            /* slower
2865
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2866
            dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2867
            */
2868
            xpos+=xInc;
2869
        }
2870
#endif /* defined(ARCH_X86) */
2871
    }
2872
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2873
        int i;
2874
        //FIXME all pal and rgb srcFormats could do this convertion as well
2875
        //FIXME all scalers more complex than bilinear could do half of this transform
2876
        if(c->srcRange){
2877
            for (i=0; i<dstWidth; i++){
2878
                dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
2879
                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2880
            }
2881
        }else{
2882
            for (i=0; i<dstWidth; i++){
2883
                dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2884
                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2885
            }
2886
        }
2887
    }
2888
}
2889

    
2890
/**
 * Scale one horizontal slice of the source picture into the destination.
 *
 * This is the CPU-specific main entry point (the actual symbol name is
 * produced by the RENAME() template macro). It horizontally scales the
 * luma and chroma lines of the slice into the ring buffers
 * (c->lumPixBuf / c->chrPixBuf) and, whenever enough input lines have been
 * buffered, vertically scales/converts them into the output planes.
 *
 * @param c          scaler context; carries all filters, buffers and the
 *                   ring-buffer state that persists across slice calls
 * @param src        source plane pointers (for packed/paletted input,
 *                   src[1] holds the palette — see isPacked() branch below)
 * @param srcStride  per-plane source strides in bytes
 * @param srcSliceY  first source line of this slice
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers
 * @param dstStride  per-plane destination strides in bytes
 * @return           number of destination lines output by this call
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    const int canMMX2BeUsed= c->canMMX2BeUsed;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *funnyYCode= c->funnyYCode;
    uint8_t *funnyUVCode= c->funnyUVCode;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); // round up
    int lastDstY;
    uint32_t *pal=NULL;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    if (isPacked(c->srcFormat)){
        /* packed/paletted input: src[1] carries the palette; alias all three
           plane pointers/strides to plane 0 so the per-plane loops below work */
        pal= (uint32_t *)src[1];
        src[0]=
        src[1]=
        src[2]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

#if 0 //self test FIXME move to a vfilter or something
    {
    static volatile int i=0;
    i++;
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
    i--;
    }
#endif

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
    {
        static int firstTime=1; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && firstTime)
        {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            firstTime=0;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0){
        /* first slice of a new frame: reset the ring-buffer state */
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
                //FIXME replace parameters through context struct (some at least)

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest in the buffer
        {
            //Do horizontal scaling
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                            flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                            funnyUVCode, c->srcFormat, formatConvBuffer,
                            c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }

#ifdef HAVE_MMX
        /* per-line dither tables, alternating between even and odd lines */
        b5Dither= ff_dither8[dstY&1];
        g6Dither= ff_dither4[dstY&1];
        g5Dither= ff_dither8[dstY&1];
        r5Dither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2)
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
            int i;
        if (flags & SWS_ACCURATE_RND){
            /* pack pointer pairs + coefficient pairs for the accurate-rounding
               MMX vertical scaler (APCK_* layout) */
            int s= APCK_SIZE / 8;
            for (i=0; i<vLumFilterSize; i+=2){
                *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                          lumMmxFilter[s*i+APCK_COEF/4  ]=
                          lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
            }
            for (i=0; i<vChrFilterSize; i+=2){
                *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                          chrMmxFilter[s*i+APCK_COEF/4  ]=
                          chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
            }
        }else{
            /* default layout: split each source pointer into low/high 32 bits
               and duplicate the 16-bit coefficient into both halves */
            for (i=0; i<vLumFilterSize; i++)
            {
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
            }
            for (i=0; i<vChrFilterSize; i++)
            {
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
        }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                RENAME(yuv2nv12X)(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, uDest, vDest, dstW, chrDstW);
                }
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                            dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                            dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                }
                else //general RGB
                {
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }else{
                        RENAME(yuv2packedX)(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            dest, dstW, dstY);
                    }
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, vDest, dstW, chrDstW);
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT){
                    yuv2rgbXinC_full(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }else{
                    yuv2packedXinC(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }
            }
        }
    }

#ifdef HAVE_MMX
    /* leave MMX state clean for any FPU code that runs after us */
    asm volatile(SFENCE:::"memory");
    asm volatile(EMMS:::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}