Statistics
| Branch: | Revision:

ffmpeg / libswscale / swscale_template.c @ 12794f73

History | View | Annotate | Download (126 KB)

1
/*
2
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation; either version 2 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU General Public License
17
 * along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 *
20
 * The C code (not assembly, MMX, ...) of this file can be used
21
 * under the LGPL license.
22
 */
23

    
24
/* Reset all per-variant helper macros so this template can be re-included
 * multiple times with different CPU-capability settings (plain MMX, MMX2,
 * 3DNow). Each is redefined below according to HAVE_* flags. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
31

    
32
/* Instruction used to leave MMX state before returning to FPU code. */
#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif
38

    
39
/* Read/write prefetch hints: 3DNow has its own prefetch/prefetchw pair,
 * MMX2 (SSE) provides prefetchnta/prefetcht0; otherwise expand to an
 * assembler comment so the asm strings stay well-formed. */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
49

    
50
/* Store fence, needed after the non-temporal movntq stores used below;
 * a no-op comment when only plain MMX is available (movntq unused then). */
#if HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif
55

    
56
#if HAVE_MMX2
57
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58
#elif HAVE_AMD3DNOW
59
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60
#endif
61

    
62
/* Quadword store: non-temporal (cache-bypassing) movntq on MMX2, falling
 * back to a normal movq. The REAL_/wrapper pair forces macro expansion of
 * the register arguments before stringification. */
#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
68

    
69
#if HAVE_ALTIVEC
70
#include "swscale_altivec_template.c"
71
#endif
72

    
73
/*
 * Vertical scale to planar YV12: for each group of 8 output pixels,
 * accumulates pmulhw(filterCoeff, srcData) over the filter-tap list
 * (the list is terminated by a NULL source pointer — hence the
 * test/jnz back-edge), starts from the VROUNDER_OFFSET rounding bias,
 * shifts right by 3 and packs to unsigned bytes, storing via MOVNTQ.
 * %0 = &c->redDither (base address for the context offsets),
 * %1 = dest, %2 = width; clobbers REG_a/REG_d/REG_S and mm0-mm5.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t"\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
108

    
109
/*
 * Higher-precision variant of YSCALEYUV2YV12X: interleaves word data from
 * pairs of taps (punpcklwd/punpckhwd) and uses pmaddwd so products are
 * accumulated in 32-bit precision (mm4-mm7), then shifts the dword sums
 * right by 16, packs back to words, applies the VROUNDER_OFFSET bias,
 * shifts by 3 and packs to bytes.  Filter entries are walked in
 * APCK_SIZE-sized records (APCK_PTR2 = second source pointer,
 * APCK_COEF = coefficient pair); a NULL pointer terminates the list.
 * Same operands/clobbers as YSCALEYUV2YV12X.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t"\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
170

    
171
/*
 * Unfiltered 1:1 vertical pass: converts 16-bit intermediate samples to
 * 8-bit output by an arithmetic shift right of 7 and unsigned-saturating
 * pack, 8 pixels per iteration, stored via MOVNTQ.
 * %0 = src (16-bit), %1 = dest, %2 = negative start index in REG_a
 * (loop runs until the add carries past zero — hence jnc).
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
183

    
184
/*
 * Rounding variant of YSCALEYUV2YV121: builds the constant 64 (1 << 6) in
 * mm7 via pcmpeqw/psrlw/psllw and adds it (saturating) before the >>7,
 * so the conversion rounds to nearest instead of truncating.
 * Same operands and loop structure as YSCALEYUV2YV121.
 */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
201

    
202
/*
203
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
204
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
205
       "r" (dest), "m" (dstW),
206
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
207
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
208
*/
209
/*
 * Vertical-scale prologue for packed output formats: loop "1:" iterates
 * over output pixels; the two inner "2:" loops run the chroma filter
 * (U accumulates in mm3, V in mm4 — V read at offset VOF) and then the
 * luma filter (Y1 in mm1, Y2 in mm7), each walking a NULL-terminated
 * tap list from CHR_/LUM_MMX_FILTER_OFFSET.  Deliberately left open
 * (trailing backslash): a per-format conversion/store body plus
 * YSCALEYUV2PACKEDX_END must follow.  %0 = &c->redDither.
 */
#define YSCALEYUV2PACKEDX \
    __asm__ volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
    "movq                      %%mm1, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm1         \n\t"\
    "paddw                     %%mm5, %%mm7         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
250

    
251
/*
 * Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE):
 * supplies the operand list (%0 = &c->redDither, %4 = dest, %5 = dstW;
 * the three "dummy"s keep operand numbering consistent) and the clobbers.
 */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
257

    
258
/*
 * High-precision version of YSCALEYUV2PACKEDX: uses pmaddwd with 32-bit
 * accumulation (same APCK_* record walk as YSCALEYUV2YV12X_ACCURATE).
 * Chroma results are rounded, packed and parked in the context's
 * U_TEMP/V_TEMP slots while the luma loop reuses the registers; at the
 * end U is reloaded into mm3, V into mm4, with Y1 in mm1 and Y2 in mm7 —
 * the same register contract the per-format store bodies expect.
 * Left open (trailing backslash); must be followed by a conversion body
 * and YSCALEYUV2PACKEDX_END.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\
351

    
352
/*
 * YUV -> RGB conversion stage for the packed-X path. Expects the register
 * contract left by YSCALEYUV2PACKEDX(_ACCURATE): Y1 in mm1, Y2 in mm7,
 * U in mm3, V in mm4; %0 = &c->redDither.  Applies the per-context
 * offset/coefficient tables (U_OFFSET ... Y_COEFF), interleaves the two
 * 4-pixel halves and packs saturated bytes, ending with
 * mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\
    "pxor            %%mm7, %%mm7       \n\t"
388

    
389
/*
 * Two-tap vertical bilinear interpolation producing packed Y/U/V words.
 * First pre-shifts the chroma/luma blend factors stored at
 * CHR_/LUM_MMX_FILTER_OFFSET+8 right by 3 (writing them back), then for
 * each iteration blends uvbuf0/uvbuf1 (chroma, V at offset VOF) and
 * buf0/buf1 (luma) as  b1 + ((b0-b1)*alpha >> 16).
 * %0/%1 = buf0/buf1, %2/%3 = uvbuf0/uvbuf1; "index" is the loop counter
 * register, "c" the context pointer.  Left open: the caller appends the
 * per-format store code and loop close.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
424

    
425
/* Wrapper forcing macro expansion of index/c before REAL_ stringifies them. */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
426

    
427
/*
 * Two-tap vertical interpolation (buf0/buf1, uvbuf0/uvbuf1 blended by the
 * alphas at LUM_/CHR_MMX_FILTER_OFFSET+8) followed by YUV -> RGB
 * conversion using the per-context coefficient tables.  Ends with the
 * same register contract as YSCALEYUV2RGBX: mm2 = B bytes, mm4 = G bytes,
 * mm5 = R bytes, mm7 = 0.  Left open for a per-format store body.
 */
#define REAL_YSCALEYUV2RGB(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
/* Wrapper forcing macro expansion of index/c before REAL_ stringifies them. */
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
492

    
493
/*
 * Single-buffer (no vertical interpolation) packed-output loop head:
 * reads uvbuf0 only (V at offset VOF) and buf0 only, converting the
 * 16-bit intermediates to output range with >>7.  Leaves U in mm3,
 * V in mm4, Y1 in mm1, Y2 in mm7; the caller appends the store body.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

/* Wrapper forcing macro expansion of index/c before REAL_ stringifies them. */
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
507

    
508
/*
 * Single-buffer (no vertical interpolation) YUV -> RGB loop head:
 * reads uvbuf0/buf0 only (>>4 instead of blending), then performs the
 * same coefficient-table conversion as REAL_YSCALEYUV2RGB, ending with
 * mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 * Left open for a per-format store body.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
/* Wrapper forcing macro expansion of index/c before REAL_ stringifies them. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
556

    
557
/*
 * Packed-output loop head that averages the two chroma buffers
 * (uvbuf0+uvbuf1, then >>8 which combines /2 with the >>7 range
 * reduction) while the luma still comes from buf0 alone (>>7).
 * Leaves U in mm3, V in mm4, Y1 in mm1, Y2 in mm7.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
/* Wrapper forcing macro expansion of index/c before REAL_ stringifies them. */
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
574

    
575
// do vertical chrominance interpolation
/* Unscaled YUV->RGB with vertical chroma interpolation: chroma is the
 * average of uvbuf0/uvbuf1, luma comes from buf0 only.  Produces packed
 * B/G/R bytes in mm2/mm4/mm5 with mm7 zeroed, ready for the WRITE* macros. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
/* Interleave the packed B/G/R bytes (mm2/mm4/mm5, mm7=0) into four
 * quadwords of 0RGB pixels and store 8 BGR32 pixels per iteration. */
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0,   (dst, index, 4))\
    MOVNTQ(%%mm2,  8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
/* Pack B/G/R bytes (mm2/mm4/mm5, mm7=0) into RGB565 and store
 * 8 16-bit pixels per iteration (5 bits B, 6 bits G, 5 bits R). */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
/* Pack B/G/R bytes (mm2/mm4/mm5, mm7=0) into RGB555 and store
 * 8 16-bit pixels per iteration (5 bits per component). */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
/* Legacy mask-and-shift BGR24 writer: shuffles four 0RGB quadwords into
 * three contiguous 24-bit-pixel quadwords (24 bytes / 8 pixels) per loop.
 * Kept for reference; superseded by WRITEBGR24MMX / WRITEBGR24MMX2. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
/* Plain-MMX BGR24 writer: builds four 0RGBRGB0 quadwords via shifts and
 * punpckhdq, then merges them into three 24-byte-aligned stores
 * (8 pixels / 24 bytes per iteration). */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
/* MMX2 BGR24 writer: uses pshufw plus the ff_M24A/B/C byte masks to
 * scatter B/G/R into the three output quadwords directly
 * (8 pixels / 24 bytes per iteration). */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
/* Select the BGR24 writer: pshufw-based MMX2 variant when available,
 * otherwise the plain MMX variant. */
#if HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
/* Pack Y (mm1/mm7) and U/V (mm3/mm4) into interleaved YUYV and store
 * 8 pixels (16 bytes) per iteration. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
/**
 * Vertical scaling to planar YV12: applies lumFilter/chrFilter to the
 * source line pointers and writes the result to dest/uDest/vDest.
 * Uses the MMX kernels unless SWS_BITEXACT is set; otherwise falls back
 * to AltiVec or plain C.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}
/**
 * Vertical scaling to NV12/NV21: no SIMD path here, simply forwards to
 * the C implementation yuv2nv12XinC().
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
             chrFilter, chrSrc, chrFilterSize,
             dest, uDest, dstW, chrDstW, dstFormat);
}
/**
 * Unscaled (1:1 vertical) planar output: rounds the 16-bit intermediate
 * luma/chroma down to 8 bits ((x+64)>>7) and stores it.  MMX path when
 * available and not SWS_BITEXACT; the C fallback clamps to [0,255].
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
    int i;
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= uDest ? 3 : 1;          /* planes to process: Y only, or Y+U+V */
        uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[3]= {dest, uDest, vDest};
        long counter[3] = {dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                __asm__ volatile(
                    YSCALEYUV2YV121_ACCURATE
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                    "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }else{
            while(p--){
                __asm__ volatile(
                    YSCALEYUV2YV121
                    :: "r" (src[p]), "r" (dst[p] + counter[p]),
                    "g" (-counter[p])
                    : "%"REG_a
                );
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        /* val&256 catches both overflow (>255) and, via sign bits, negatives */
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
}
/**
1003
 * vertical scale YV12 to RGB
1004
 */
1005
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1006
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1007
                                       uint8_t *dest, long dstW, long dstY)
1008
{
1009
#if HAVE_MMX
1010
    long dummy=0;
1011
    if(!(c->flags & SWS_BITEXACT)){
1012
        if (c->flags & SWS_ACCURATE_RND){
1013
            switch(c->dstFormat){
1014
            case PIX_FMT_RGB32:
1015
                YSCALEYUV2PACKEDX_ACCURATE
1016
                YSCALEYUV2RGBX
1017
                WRITEBGR32(%4, %5, %%REGa)
1018

    
1019
                YSCALEYUV2PACKEDX_END
1020
                return;
1021
            case PIX_FMT_BGR24:
1022
                YSCALEYUV2PACKEDX_ACCURATE
1023
                YSCALEYUV2RGBX
1024
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1025
                "add %4, %%"REG_c"                        \n\t"
1026
                WRITEBGR24(%%REGc, %5, %%REGa)
1027

    
1028

    
1029
                :: "r" (&c->redDither),
1030
                "m" (dummy), "m" (dummy), "m" (dummy),
1031
                "r" (dest), "m" (dstW)
1032
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1033
                );
1034
                return;
1035
            case PIX_FMT_RGB555:
1036
                YSCALEYUV2PACKEDX_ACCURATE
1037
                YSCALEYUV2RGBX
1038
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1039
#ifdef DITHER1XBPP
1040
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1041
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1042
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1043
#endif
1044

    
1045
                WRITERGB15(%4, %5, %%REGa)
1046
                YSCALEYUV2PACKEDX_END
1047
                return;
1048
            case PIX_FMT_RGB565:
1049
                YSCALEYUV2PACKEDX_ACCURATE
1050
                YSCALEYUV2RGBX
1051
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1052
#ifdef DITHER1XBPP
1053
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1054
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1055
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1056
#endif
1057

    
1058
                WRITERGB16(%4, %5, %%REGa)
1059
                YSCALEYUV2PACKEDX_END
1060
                return;
1061
            case PIX_FMT_YUYV422:
1062
                YSCALEYUV2PACKEDX_ACCURATE
1063
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1064

    
1065
                "psraw $3, %%mm3    \n\t"
1066
                "psraw $3, %%mm4    \n\t"
1067
                "psraw $3, %%mm1    \n\t"
1068
                "psraw $3, %%mm7    \n\t"
1069
                WRITEYUY2(%4, %5, %%REGa)
1070
                YSCALEYUV2PACKEDX_END
1071
                return;
1072
            }
1073
        }else{
1074
            switch(c->dstFormat)
1075
            {
1076
            case PIX_FMT_RGB32:
1077
                YSCALEYUV2PACKEDX
1078
                YSCALEYUV2RGBX
1079
                WRITEBGR32(%4, %5, %%REGa)
1080
                YSCALEYUV2PACKEDX_END
1081
                return;
1082
            case PIX_FMT_BGR24:
1083
                YSCALEYUV2PACKEDX
1084
                YSCALEYUV2RGBX
1085
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1086
                "add                        %4, %%"REG_c"   \n\t"
1087
                WRITEBGR24(%%REGc, %5, %%REGa)
1088

    
1089
                :: "r" (&c->redDither),
1090
                "m" (dummy), "m" (dummy), "m" (dummy),
1091
                "r" (dest),  "m" (dstW)
1092
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1093
                );
1094
                return;
1095
            case PIX_FMT_RGB555:
1096
                YSCALEYUV2PACKEDX
1097
                YSCALEYUV2RGBX
1098
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1099
#ifdef DITHER1XBPP
1100
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1101
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1102
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1103
#endif
1104

    
1105
                WRITERGB15(%4, %5, %%REGa)
1106
                YSCALEYUV2PACKEDX_END
1107
                return;
1108
            case PIX_FMT_RGB565:
1109
                YSCALEYUV2PACKEDX
1110
                YSCALEYUV2RGBX
1111
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1112
#ifdef DITHER1XBPP
1113
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1114
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1115
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1116
#endif
1117

    
1118
                WRITERGB16(%4, %5, %%REGa)
1119
                YSCALEYUV2PACKEDX_END
1120
                return;
1121
            case PIX_FMT_YUYV422:
1122
                YSCALEYUV2PACKEDX
1123
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1124

    
1125
                "psraw $3, %%mm3    \n\t"
1126
                "psraw $3, %%mm4    \n\t"
1127
                "psraw $3, %%mm1    \n\t"
1128
                "psraw $3, %%mm7    \n\t"
1129
                WRITEYUY2(%4, %5, %%REGa)
1130
                YSCALEYUV2PACKEDX_END
1131
                return;
1132
            }
1133
        }
1134
    }
1135
#endif /* HAVE_MMX */
1136
#if HAVE_ALTIVEC
1137
    /* The following list of supported dstFormat values should
1138
       match what's found in the body of altivec_yuv2packedX() */
1139
    if (!(c->flags & SWS_BITEXACT) &&
1140
       (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1141
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1142
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1143
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1144
                                 chrFilter, chrSrc, chrFilterSize,
1145
                                 dest, dstW, dstY);
1146
    else
1147
#endif
1148
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1149
                       chrFilter, chrSrc, chrFilterSize,
1150
                       dest, dstW, dstY);
1151
}
1152

    
1153
/**
1154
 * vertical bilinear scale YV12 to RGB
1155
 */
1156
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1157
                          uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1158
{
1159
    int  yalpha1=4095- yalpha;
1160
    int uvalpha1=4095-uvalpha;
1161
    int i;
1162

    
1163
#if HAVE_MMX
1164
    if(!(c->flags & SWS_BITEXACT)){
1165
        switch(c->dstFormat)
1166
        {
1167
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1168
            case PIX_FMT_RGB32:
1169
                __asm__ volatile(
1170
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1171
                "mov        %4, %%"REG_b"               \n\t"
1172
                "push %%"REG_BP"                        \n\t"
1173
                YSCALEYUV2RGB(%%REGBP, %5)
1174
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1175
                "pop %%"REG_BP"                         \n\t"
1176
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1177

    
1178
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1179
                "a" (&c->redDither)
1180
                );
1181
                return;
1182
            case PIX_FMT_BGR24:
1183
                __asm__ volatile(
1184
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1185
                "mov        %4, %%"REG_b"               \n\t"
1186
                "push %%"REG_BP"                        \n\t"
1187
                YSCALEYUV2RGB(%%REGBP, %5)
1188
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1189
                "pop %%"REG_BP"                         \n\t"
1190
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1191
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1192
                "a" (&c->redDither)
1193
                );
1194
                return;
1195
            case PIX_FMT_RGB555:
1196
                __asm__ volatile(
1197
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1198
                "mov        %4, %%"REG_b"               \n\t"
1199
                "push %%"REG_BP"                        \n\t"
1200
                YSCALEYUV2RGB(%%REGBP, %5)
1201
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1202
#ifdef DITHER1XBPP
1203
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1204
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1205
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1206
#endif
1207

    
1208
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1209
                "pop %%"REG_BP"                         \n\t"
1210
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1211

    
1212
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1213
                "a" (&c->redDither)
1214
                );
1215
                return;
1216
            case PIX_FMT_RGB565:
1217
                __asm__ volatile(
1218
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1219
                "mov        %4, %%"REG_b"               \n\t"
1220
                "push %%"REG_BP"                        \n\t"
1221
                YSCALEYUV2RGB(%%REGBP, %5)
1222
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1223
#ifdef DITHER1XBPP
1224
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1225
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1226
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1227
#endif
1228

    
1229
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1230
                "pop %%"REG_BP"                         \n\t"
1231
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1232
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1233
                "a" (&c->redDither)
1234
                );
1235
                return;
1236
            case PIX_FMT_YUYV422:
1237
                __asm__ volatile(
1238
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1239
                "mov %4, %%"REG_b"                        \n\t"
1240
                "push %%"REG_BP"                        \n\t"
1241
                YSCALEYUV2PACKED(%%REGBP, %5)
1242
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1243
                "pop %%"REG_BP"                         \n\t"
1244
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1245
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1246
                "a" (&c->redDither)
1247
                );
1248
                return;
1249
            default: break;
1250
        }
1251
    }
1252
#endif //HAVE_MMX
1253
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C, YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1254
}
1255

    
1256
/**
1257
 * YV12 to RGB without scaling or interpolating
1258
 */
1259
/**
 * Convert one YV12 line to a packed pixel format without vertical scaling:
 * a single luma line (buf0) and one or two chroma lines. When
 * uvalpha < 2048 only uvbuf0 is used; otherwise uvbuf0/uvbuf1 are
 * averaged (the "1b" asm variants / *1B* C variants).
 *
 * @param c         swscale context (dither tables, redDither scratch area)
 * @param buf0      luma input line (15-bit intermediate samples)
 * @param uvbuf0    first chroma input line
 * @param uvbuf1    second chroma input line (blended in when uvalpha >= 2048)
 * @param dest      packed output scanline in dstFormat layout
 * @param dstW      number of output pixels
 * @param uvalpha   chroma blend factor in 0..4096
 * @param dstFormat PIX_FMT_* of the output
 * @param flags     SWS_* flags; SWS_BITEXACT disables the MMX fast paths
 * @param y         output line index (used by the C fallback for dithering)
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                          uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT)
    {
        // Full-chroma-resolution output: delegate to the two-line blender.
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
        return;
    }

#if HAVE_MMX
    if(!(flags & SWS_BITEXACT)){
        if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        {
            /* Single chroma line: YSCALEYUV2RGB1/YSCALEYUV2PACKED1 variants.
               Each asm block spills REG_b and REG_BP because the conversion
               macros need every general-purpose register; %5 points at
               c->redDither, through which context offsets are addressed. */
            switch(dstFormat)
            {
            case PIX_FMT_RGB32:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2PACKED1(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            }
        }
        else
        {
            /* Two chroma lines averaged: the "1b" macro variants. */
            switch(dstFormat)
            {
            case PIX_FMT_RGB32:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2PACKED1b(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* HAVE_MMX */
    /* Bit-exact / non-MMX path: generic C conversion macros. */
    if (uvalpha < 2048)
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C, YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
1463

    
1464
//FIXME yuy2* can read up to 7 samples too much
1465

    
1466
/* Extract the luma plane from packed YUYV: Y samples sit in the even bytes. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* Negative-index loop: REG_a runs from -width up to 0 while the base
       pointers point at the end of the buffers. bm01010101 masks out the
       chroma bytes; packuswb merges two masked quads into 8 Y samples. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm2           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
    "pand                %%mm2, %%mm0           \n\t"
    "pand                %%mm2, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
    "add                    $8, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1490

    
1491
/* Deinterleave the chroma from packed YUYV into separate U and V planes.
 * In YUYV the byte order is Y0 U Y1 V, so U is byte 1 and V is byte 3 of
 * every 4-byte group. src2 must equal src1 (asserted below). */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* First psrlw drops the Y bytes; after packuswb the words alternate
       U,V. The second psrlw isolates V, the pand (bm01010101) isolates U;
       4 chroma samples per plane are stored per iteration. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "psrlw                  $8, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1525

    
1526
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
1528
/* Extract the luma plane from packed UYVY: Y samples sit in the odd bytes,
 * so a 8-bit right word-shift (instead of a mask) selects them. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* Same negative-index loop as yuy2ToY; psrlw $8 discards the chroma
       bytes sitting in the low half of each word. */
    __asm__ volatile(
    "mov                  %0, %%"REG_a"         \n\t"
    "1:                                         \n\t"
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
    "psrlw                $8, %%mm0             \n\t"
    "psrlw                $8, %%mm1             \n\t"
    "packuswb          %%mm1, %%mm0             \n\t"
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
    "add                  $8, %%"REG_a"         \n\t"
    " js                  1b                    \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1551

    
1552
/* Deinterleave the chroma from packed UYVY into separate U and V planes.
 * In UYVY the byte order is U Y0 V Y1, so U is byte 0 and V is byte 2 of
 * every 4-byte group. src2 must equal src1 (asserted below). */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    /* Mirror of yuy2ToUV: the initial pand (rather than a shift) keeps the
       low bytes, which are the chroma in UYVY. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "pand                %%mm4, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
1586

    
1587
/* Generates a RENAME(name)() luma extractor for a packed RGB/BGR layout
 * described by per-channel shifts (shr/shg/shb) and masks. RY/GY/BY are the
 * (possibly pre-shifted) conversion coefficients and S the final right
 * shift. The bias 33<<((S)-1) equals (16<<S) + (1<<(S-1)): the +16 luma
 * offset plus 0.5 for rounding. */
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void RENAME(name)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]>>shb)&maskb;\
        int g= (((type*)src)[i]>>shg)&maskg;\
        int r= (((type*)src)[i]>>shr)&maskr;\
\
        dst[i]= (((RY)*r + (GY)*g + (BY)*b + (33<<((S)-1)))>>(S));\
    }\
}
1600

    
1601
/* Luma extractors for the common 32-, 16- and 15-bit RGB/BGR layouts. */
BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1607

    
1608
/* Generates two chroma extractors for a packed RGB/BGR layout:
 *   RENAME(name)       - one U/V pair per input pixel; 257<<((S)-1) is the
 *                        +128 chroma offset plus 0.5 rounding, scaled by S.
 *   RENAME(name_half)  - one U/V pair per TWO input pixels (horizontal
 *                        2:1 chroma subsampling). The two pixels are summed
 *                        per channel without unpacking: g collects both
 *                        green fields, and (pix0+pix1-g) leaves the summed
 *                        blue/red fields, whose (mask|2*mask) windows hold
 *                        the carries of each sum. The final shift is S+1 to
 *                        divide the two-pixel sum back down. */
#define BGR2UV(type, name, shr, shg, shb, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((type*)src)[i]&maskb)>>shb;\
        int g= (((type*)src)[i]&maskg)>>shg;\
        int r= (((type*)src)[i]&maskr)>>shr;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
    }\
}\
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int pix0= ((type*)src)[2*i+0];\
        int pix1= ((type*)src)[2*i+1];\
        int g= (pix0&maskg)+(pix1&maskg);\
        int b= ((pix0+pix1-g)&(maskb|(2*maskb)))>>shb;\
        int r= ((pix0+pix1-g)&(maskr|(2*maskr)))>>shr;\
\
        g>>=shg;\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
    }\
}
1639

    
1640
/* Chroma extractors (full and 2:1-subsampled) for the common 32-, 16- and
 * 15-bit RGB/BGR layouts. */
BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF0000, 0xFF00, 0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1646

    
1647
#if HAVE_MMX
1648
/* MMX BGR24/RGB24 -> luma. Shared by both byte orders: only the coefficient
 * quads loaded into mm5/mm6 differ (the second asm block relies on those
 * registers surviving between the two statements). Processes 4 pixels
 * (12 source bytes) per iteration using a negative index in REG_a. */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, uint8_t *src, long width, int srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24){
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }else{
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    /* Overlapping movd loads split each 6-byte pixel pair so that pmaddwd
       against the coefficient pairs in mm5/mm6 produces the dot products;
       mm4 adds the rounding/offset term before the >>15 normalization. */
    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t"
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t"
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" (-width)
    : "%"REG_a
    );
}
1700

    
1701
/* MMX BGR24/RGB24 -> U and V planes. The coefficient table %4
 * (ff_bgr24toUV, indexed by byte order) holds four quads: U-low, U-high,
 * V-low, V-high (the last cached in mm6). Processes 4 pixels per
 * iteration; U results accumulate in mm0/mm1, V in mm2/mm4. */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, uint8_t *src, long width, int srcFormat)
{
    __asm__ volatile(
        "movq                    24+%4, %%mm6       \n\t"
        "mov                        %3, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                    %4, %%mm0       \n\t"
        "pmaddwd                  8+%4, %%mm1       \n\t"
        "pmaddwd                 16+%4, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        /* Second pixel pair of the group of four. */
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                    %4, %%mm1       \n\t"
        "pmaddwd                  8+%4, %%mm3       \n\t"
        "pmaddwd                 16+%4, %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        /* Add the chroma offset, normalize (>>15) and pack to bytes. */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" (-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    : "%"REG_a
    );
}
1758
#endif
1759

    
1760
/* BGR24 -> luma: MMX fast path when available, otherwise a scalar loop.
 * The bias 33<<(RGB2YUV_SHIFT-1) is the +16 luma offset plus 0.5 rounding,
 * scaled by RGB2YUV_SHIFT. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* HAVE_MMX */
}
1776

    
1777
/* BGR24 -> U/V planes (one chroma sample per pixel). src2 must equal src1.
 * The bias 257<<(RGB2YUV_SHIFT-1) is the +128 chroma offset plus 0.5
 * rounding, scaled by RGB2YUV_SHIFT. */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif /* HAVE_MMX */
    assert(src1 == src2);
}
1795

    
1796
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1797
{
1798
    int i;
1799
    for (i=0; i<width; i++)
1800
    {
1801
        int b= src1[6*i + 0] + src1[6*i + 3];
1802
        int g= src1[6*i + 1] + src1[6*i + 4];
1803
        int r= src1[6*i + 2] + src1[6*i + 5];
1804

    
1805
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1806
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1807
    }
1808
    assert(src1 == src2);
1809
}
1810

    
1811
/* RGB24 -> luma: same as bgr24ToY but with R and B swapped in memory; the
 * MMX helper is parameterized by the source format. */
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
{
#if HAVE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
#else
    int i;
    for (i=0; i<width; i++)
    {
        int r= src[i*3+0];
        int g= src[i*3+1];
        int b= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif
}
1827

    
1828
/* RGB24 -> U/V planes: same as bgr24ToUV with R and B swapped in memory.
 * src2 must equal src1 (asserted). */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
{
#if HAVE_MMX
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif
}
1847

    
1848
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *unused)
1849
{
1850
    int i;
1851
    assert(src1==src2);
1852
    for (i=0; i<width; i++)
1853
    {
1854
        int r= src1[6*i + 0] + src1[6*i + 3];
1855
        int g= src1[6*i + 1] + src1[6*i + 4];
1856
        int b= src1[6*i + 2] + src1[6*i + 5];
1857

    
1858
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1859
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1860
    }
1861
}
1862

    
1863

    
1864
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, long width, uint32_t *pal)
1865
{
1866
    int i;
1867
    for (i=0; i<width; i++)
1868
    {
1869
        int d= src[i];
1870

    
1871
        dst[i]= pal[d] & 0xFF;
1872
    }
1873
}
1874

    
1875
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width, uint32_t *pal)
1876
{
1877
    int i;
1878
    assert(src1 == src2);
1879
    for (i=0; i<width; i++)
1880
    {
1881
        int p= pal[src1[i]];
1882

    
1883
        dstU[i]= p>>8;
1884
        dstV[i]= p>>16;
1885
    }
1886
}
1887

    
1888
static inline void RENAME(monowhite2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1889
{
1890
    int i, j;
1891
    for (i=0; i<width/8; i++){
1892
        int d= ~src[i];
1893
        for(j=0; j<8; j++)
1894
            dst[8*i+j]= ((d>>(7-j))&1)*255;
1895
    }
1896
}
1897

    
1898
static inline void RENAME(monoblack2Y)(uint8_t *dst, uint8_t *src, long width, uint32_t *unused)
1899
{
1900
    int i, j;
1901
    for (i=0; i<width/8; i++){
1902
        int d= src[i];
1903
        for(j=0; j<8; j++)
1904
            dst[8*i+j]= ((d>>(7-j))&1)*255;
1905
    }
1906
}
1907

    
1908
// bilinear / bicubic scaling
1909
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
1910
                                  int16_t *filter, int16_t *filterPos, long filterSize)
1911
{
1912
#if HAVE_MMX
1913
    assert(filterSize % 4 == 0 && filterSize>0);
1914
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
1915
    {
1916
        long counter= -2*dstW;
1917
        filter-= counter*2;
1918
        filterPos-= counter/2;
1919
        dst-= counter/2;
1920
        __asm__ volatile(
1921
#if defined(PIC)
1922
        "push            %%"REG_b"              \n\t"
1923
#endif
1924
        "pxor                %%mm7, %%mm7       \n\t"
1925
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
1926
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
1927
        ASMALIGN(4)
1928
        "1:                                     \n\t"
1929
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
1930
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
1931
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
1932
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
1933
        "movd      (%3, %%"REG_a"), %%mm0       \n\t"
1934
        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
1935
        "punpcklbw           %%mm7, %%mm0       \n\t"
1936
        "punpcklbw           %%mm7, %%mm2       \n\t"
1937
        "pmaddwd             %%mm1, %%mm0       \n\t"
1938
        "pmaddwd             %%mm2, %%mm3       \n\t"
1939
        "movq                %%mm0, %%mm4       \n\t"
1940
        "punpckldq           %%mm3, %%mm0       \n\t"
1941
        "punpckhdq           %%mm3, %%mm4       \n\t"
1942
        "paddd               %%mm4, %%mm0       \n\t"
1943
        "psrad                  $7, %%mm0       \n\t"
1944
        "packssdw            %%mm0, %%mm0       \n\t"
1945
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
1946
        "add                    $4, %%"REG_BP"  \n\t"
1947
        " jnc                   1b              \n\t"
1948

    
1949
        "pop            %%"REG_BP"              \n\t"
1950
#if defined(PIC)
1951
        "pop             %%"REG_b"              \n\t"
1952
#endif
1953
        : "+a" (counter)
1954
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
1955
#if !defined(PIC)
1956
        : "%"REG_b
1957
#endif
1958
        );
1959
    }
1960
    else if (filterSize==8)
1961
    {
1962
        long counter= -2*dstW;
1963
        filter-= counter*4;
1964
        filterPos-= counter/2;
1965
        dst-= counter/2;
1966
        __asm__ volatile(
1967
#if defined(PIC)
1968
        "push             %%"REG_b"             \n\t"
1969
#endif
1970
        "pxor                 %%mm7, %%mm7      \n\t"
1971
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
1972
        "mov              %%"REG_a", %%"REG_BP" \n\t"
1973
        ASMALIGN(4)
1974
        "1:                                     \n\t"
1975
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
1976
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
1977
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
1978
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
1979
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
1980
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
1981
        "punpcklbw            %%mm7, %%mm0      \n\t"
1982
        "punpcklbw            %%mm7, %%mm2      \n\t"
1983
        "pmaddwd              %%mm1, %%mm0      \n\t"
1984
        "pmaddwd              %%mm2, %%mm3      \n\t"
1985

    
1986
        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
1987
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
1988
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
1989
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
1990
        "punpcklbw            %%mm7, %%mm4      \n\t"
1991
        "punpcklbw            %%mm7, %%mm2      \n\t"
1992
        "pmaddwd              %%mm1, %%mm4      \n\t"
1993
        "pmaddwd              %%mm2, %%mm5      \n\t"
1994
        "paddd                %%mm4, %%mm0      \n\t"
1995
        "paddd                %%mm5, %%mm3      \n\t"
1996
        "movq                 %%mm0, %%mm4      \n\t"
1997
        "punpckldq            %%mm3, %%mm0      \n\t"
1998
        "punpckhdq            %%mm3, %%mm4      \n\t"
1999
        "paddd                %%mm4, %%mm0      \n\t"
2000
        "psrad                   $7, %%mm0      \n\t"
2001
        "packssdw             %%mm0, %%mm0      \n\t"
2002
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2003
        "add                     $4, %%"REG_BP" \n\t"
2004
        " jnc                    1b             \n\t"
2005

    
2006
        "pop             %%"REG_BP"             \n\t"
2007
#if defined(PIC)
2008
        "pop              %%"REG_b"             \n\t"
2009
#endif
2010
        : "+a" (counter)
2011
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2012
#if !defined(PIC)
2013
        : "%"REG_b
2014
#endif
2015
        );
2016
    }
2017
    else
2018
    {
2019
        uint8_t *offset = src+filterSize;
2020
        long counter= -2*dstW;
2021
        //filter-= counter*filterSize/2;
2022
        filterPos-= counter/2;
2023
        dst-= counter/2;
2024
        __asm__ volatile(
2025
        "pxor                  %%mm7, %%mm7     \n\t"
2026
        ASMALIGN(4)
2027
        "1:                                     \n\t"
2028
        "mov                      %2, %%"REG_c" \n\t"
2029
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2030
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2031
        "mov                      %5, %%"REG_c" \n\t"
2032
        "pxor                  %%mm4, %%mm4     \n\t"
2033
        "pxor                  %%mm5, %%mm5     \n\t"
2034
        "2:                                     \n\t"
2035
        "movq                   (%1), %%mm1     \n\t"
2036
        "movq               (%1, %6), %%mm3     \n\t"
2037
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2038
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2039
        "punpcklbw             %%mm7, %%mm0     \n\t"
2040
        "punpcklbw             %%mm7, %%mm2     \n\t"
2041
        "pmaddwd               %%mm1, %%mm0     \n\t"
2042
        "pmaddwd               %%mm2, %%mm3     \n\t"
2043
        "paddd                 %%mm3, %%mm5     \n\t"
2044
        "paddd                 %%mm0, %%mm4     \n\t"
2045
        "add                      $8, %1        \n\t"
2046
        "add                      $4, %%"REG_c" \n\t"
2047
        "cmp                      %4, %%"REG_c" \n\t"
2048
        " jb                      2b            \n\t"
2049
        "add                      %6, %1        \n\t"
2050
        "movq                  %%mm4, %%mm0     \n\t"
2051
        "punpckldq             %%mm5, %%mm4     \n\t"
2052
        "punpckhdq             %%mm5, %%mm0     \n\t"
2053
        "paddd                 %%mm0, %%mm4     \n\t"
2054
        "psrad                    $7, %%mm4     \n\t"
2055
        "packssdw              %%mm4, %%mm4     \n\t"
2056
        "mov                      %3, %%"REG_a" \n\t"
2057
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2058
        "add                      $4, %0        \n\t"
2059
        " jnc                     1b            \n\t"
2060

    
2061
        : "+r" (counter), "+r" (filter)
2062
        : "m" (filterPos), "m" (dst), "m"(offset),
2063
          "m" (src), "r" (filterSize*2)
2064
        : "%"REG_a, "%"REG_c, "%"REG_d
2065
        );
2066
    }
2067
#else
2068
#if HAVE_ALTIVEC
2069
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2070
#else
2071
    int i;
2072
    for (i=0; i<dstW; i++)
2073
    {
2074
        int j;
2075
        int srcPos= filterPos[i];
2076
        int val=0;
2077
        //printf("filterPos: %d\n", filterPos[i]);
2078
        for (j=0; j<filterSize; j++)
2079
        {
2080
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2081
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2082
        }
2083
        //filter += hFilterSize;
2084
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2085
        //dst[i] = val>>7;
2086
    }
2087
#endif /* HAVE_ALTIVEC */
2088
#endif /* HAVE_MMX */
2089
}
      // *** horizontal scale Y line to temp buffer
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
2092
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
2093
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
2094
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2095
                                   int32_t *mmx2FilterPos, uint32_t *pal)
2096
{
2097
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
2098
    {
2099
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW, pal);
2100
        src= formatConvBuffer;
2101
    }
2102
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
2103
    {
2104
        RENAME(uyvyToY)(formatConvBuffer, src, srcW, pal);
2105
        src= formatConvBuffer;
2106
    }
2107
    else if (srcFormat==PIX_FMT_RGB32)
2108
    {
2109
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW, pal);
2110
        src= formatConvBuffer;
2111
    }
2112
    else if (srcFormat==PIX_FMT_RGB32_1)
2113
    {
2114
        RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2115
        src= formatConvBuffer;
2116
    }
2117
    else if (srcFormat==PIX_FMT_BGR24)
2118
    {
2119
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW, pal);
2120
        src= formatConvBuffer;
2121
    }
2122
    else if (srcFormat==PIX_FMT_BGR565)
2123
    {
2124
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW, pal);
2125
        src= formatConvBuffer;
2126
    }
2127
    else if (srcFormat==PIX_FMT_BGR555)
2128
    {
2129
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW, pal);
2130
        src= formatConvBuffer;
2131
    }
2132
    else if (srcFormat==PIX_FMT_BGR32)
2133
    {
2134
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW, pal);
2135
        src= formatConvBuffer;
2136
    }
2137
    else if (srcFormat==PIX_FMT_BGR32_1)
2138
    {
2139
        RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW, pal);
2140
        src= formatConvBuffer;
2141
    }
2142
    else if (srcFormat==PIX_FMT_RGB24)
2143
    {
2144
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW, pal);
2145
        src= formatConvBuffer;
2146
    }
2147
    else if (srcFormat==PIX_FMT_RGB565)
2148
    {
2149
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW, pal);
2150
        src= formatConvBuffer;
2151
    }
2152
    else if (srcFormat==PIX_FMT_RGB555)
2153
    {
2154
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW, pal);
2155
        src= formatConvBuffer;
2156
    }
2157
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2158
    {
2159
        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
2160
        src= formatConvBuffer;
2161
    }
2162
    else if (srcFormat==PIX_FMT_MONOBLACK)
2163
    {
2164
        RENAME(monoblack2Y)(formatConvBuffer, src, srcW, pal);
2165
        src= formatConvBuffer;
2166
    }
2167
    else if (srcFormat==PIX_FMT_MONOWHITE)
2168
    {
2169
        RENAME(monowhite2Y)(formatConvBuffer, src, srcW, pal);
2170
        src= formatConvBuffer;
2171
    }
2172

    
2173
#if HAVE_MMX
2174
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2175
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2176
#else
2177
    if (!(flags&SWS_FAST_BILINEAR))
2178
#endif
2179
    {
2180
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2181
    }
2182
    else // fast bilinear upscale / crap downscale
2183
    {
2184
#if ARCH_X86
2185
#if HAVE_MMX2
2186
        int i;
2187
#if defined(PIC)
2188
        uint64_t ebxsave __attribute__((aligned(8)));
2189
#endif
2190
        if (canMMX2BeUsed)
2191
        {
2192
            __asm__ volatile(
2193
#if defined(PIC)
2194
            "mov               %%"REG_b", %5        \n\t"
2195
#endif
2196
            "pxor                  %%mm7, %%mm7     \n\t"
2197
            "mov                      %0, %%"REG_c" \n\t"
2198
            "mov                      %1, %%"REG_D" \n\t"
2199
            "mov                      %2, %%"REG_d" \n\t"
2200
            "mov                      %3, %%"REG_b" \n\t"
2201
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
2202
            PREFETCH"        (%%"REG_c")            \n\t"
2203
            PREFETCH"      32(%%"REG_c")            \n\t"
2204
            PREFETCH"      64(%%"REG_c")            \n\t"
2205

    
2206
#if ARCH_X86_64
2207

    
2208
#define FUNNY_Y_CODE \
2209
            "movl            (%%"REG_b"), %%esi     \n\t"\
2210
            "call                    *%4            \n\t"\
2211
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
2212
            "add               %%"REG_S", %%"REG_c" \n\t"\
2213
            "add               %%"REG_a", %%"REG_D" \n\t"\
2214
            "xor               %%"REG_a", %%"REG_a" \n\t"\
2215

    
2216
#else
2217

    
2218
#define FUNNY_Y_CODE \
2219
            "movl (%%"REG_b"), %%esi        \n\t"\
2220
            "call         *%4                       \n\t"\
2221
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2222
            "add               %%"REG_a", %%"REG_D" \n\t"\
2223
            "xor               %%"REG_a", %%"REG_a" \n\t"\
2224

    
2225
#endif /* ARCH_X86_64 */
2226

    
2227
FUNNY_Y_CODE
2228
FUNNY_Y_CODE
2229
FUNNY_Y_CODE
2230
FUNNY_Y_CODE
2231
FUNNY_Y_CODE
2232
FUNNY_Y_CODE
2233
FUNNY_Y_CODE
2234
FUNNY_Y_CODE
2235

    
2236
#if defined(PIC)
2237
            "mov                      %5, %%"REG_b" \n\t"
2238
#endif
2239
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2240
            "m" (funnyYCode)
2241
#if defined(PIC)
2242
            ,"m" (ebxsave)
2243
#endif
2244
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2245
#if !defined(PIC)
2246
            ,"%"REG_b
2247
#endif
2248
            );
2249
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2250
        }
2251
        else
2252
        {
2253
#endif /* HAVE_MMX2 */
2254
        long xInc_shr16 = xInc >> 16;
2255
        uint16_t xInc_mask = xInc & 0xffff;
2256
        //NO MMX just normal asm ...
2257
        __asm__ volatile(
2258
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
2259
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
2260
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
2261
        ASMALIGN(4)
2262
        "1:                                  \n\t"
2263
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
2264
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
2265
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2266
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2267
        "shll      $16, %%edi                \n\t"
2268
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2269
        "mov        %1, %%"REG_D"            \n\t"
2270
        "shrl       $9, %%esi                \n\t"
2271
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2272
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
2273
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry
2274

    
2275
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
2276
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
2277
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
2278
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
2279
        "shll      $16, %%edi                \n\t"
2280
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2281
        "mov        %1, %%"REG_D"            \n\t"
2282
        "shrl       $9, %%esi                \n\t"
2283
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
2284
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
2285
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry
2286

    
2287

    
2288
        "add        $2, %%"REG_a"            \n\t"
2289
        "cmp        %2, %%"REG_a"            \n\t"
2290
        " jb        1b                       \n\t"
2291

    
2292

    
2293
        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2294
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2295
        );
2296
#if HAVE_MMX2
2297
        } //if MMX2 can't be used
2298
#endif
2299
#else
2300
        int i;
2301
        unsigned int xpos=0;
2302
        for (i=0;i<dstWidth;i++)
2303
        {
2304
            register unsigned int xx=xpos>>16;
2305
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
2306
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2307
            xpos+=xInc;
2308
        }
2309
#endif /* ARCH_X86 */
2310
    }
2311

    
2312
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2313
        int i;
2314
        //FIXME all pal and rgb srcFormats could do this convertion as well
2315
        //FIXME all scalers more complex than bilinear could do half of this transform
2316
        if(c->srcRange){
2317
            for (i=0; i<dstWidth; i++)
2318
                dst[i]= (dst[i]*14071 + 33561947)>>14;
2319
        }else{
2320
            for (i=0; i<dstWidth; i++)
2321
                dst[i]= (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2322
        }
2323
    }
2324
}
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2327
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2328
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2329
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2330
                                   int32_t *mmx2FilterPos, uint32_t *pal)
2331
{
2332
    if (srcFormat==PIX_FMT_YUYV422)
2333
    {
2334
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2335
        src1= formatConvBuffer;
2336
        src2= formatConvBuffer+VOFW;
2337
    }
2338
    else if (srcFormat==PIX_FMT_UYVY422)
2339
    {
2340
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2341
        src1= formatConvBuffer;
2342
        src2= formatConvBuffer+VOFW;
2343
    }
2344
    else if (srcFormat==PIX_FMT_RGB32)
2345
    {
2346
        if(c->chrSrcHSubSample)
2347
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2348
        else
2349
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2350
        src1= formatConvBuffer;
2351
        src2= formatConvBuffer+VOFW;
2352
    }
2353
    else if (srcFormat==PIX_FMT_RGB32_1)
2354
    {
2355
        if(c->chrSrcHSubSample)
2356
            RENAME(bgr32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2357
        else
2358
            RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2359
        src1= formatConvBuffer;
2360
        src2= formatConvBuffer+VOFW;
2361
    }
2362
    else if (srcFormat==PIX_FMT_BGR24)
2363
    {
2364
        if(c->chrSrcHSubSample)
2365
            RENAME(bgr24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2366
        else
2367
            RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2368
        src1= formatConvBuffer;
2369
        src2= formatConvBuffer+VOFW;
2370
    }
2371
    else if (srcFormat==PIX_FMT_BGR565)
2372
    {
2373
        if(c->chrSrcHSubSample)
2374
            RENAME(bgr16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2375
        else
2376
            RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2377
        src1= formatConvBuffer;
2378
        src2= formatConvBuffer+VOFW;
2379
    }
2380
    else if (srcFormat==PIX_FMT_BGR555)
2381
    {
2382
        if(c->chrSrcHSubSample)
2383
            RENAME(bgr15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2384
        else
2385
            RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2386
        src1= formatConvBuffer;
2387
        src2= formatConvBuffer+VOFW;
2388
    }
2389
    else if (srcFormat==PIX_FMT_BGR32)
2390
    {
2391
        if(c->chrSrcHSubSample)
2392
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2393
        else
2394
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2395
        src1= formatConvBuffer;
2396
        src2= formatConvBuffer+VOFW;
2397
    }
2398
    else if (srcFormat==PIX_FMT_BGR32_1)
2399
    {
2400
        if(c->chrSrcHSubSample)
2401
            RENAME(rgb32ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2402
        else
2403
            RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW, pal);
2404
        src1= formatConvBuffer;
2405
        src2= formatConvBuffer+VOFW;
2406
    }
2407
    else if (srcFormat==PIX_FMT_RGB24)
2408
    {
2409
        if(c->chrSrcHSubSample)
2410
            RENAME(rgb24ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2411
        else
2412
            RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2413
        src1= formatConvBuffer;
2414
        src2= formatConvBuffer+VOFW;
2415
    }
2416
    else if (srcFormat==PIX_FMT_RGB565)
2417
    {
2418
        if(c->chrSrcHSubSample)
2419
            RENAME(rgb16ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2420
        else
2421
            RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2422
        src1= formatConvBuffer;
2423
        src2= formatConvBuffer+VOFW;
2424
    }
2425
    else if (srcFormat==PIX_FMT_RGB555)
2426
    {
2427
        if(c->chrSrcHSubSample)
2428
            RENAME(rgb15ToUV_half)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2429
        else
2430
            RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2431
        src1= formatConvBuffer;
2432
        src2= formatConvBuffer+VOFW;
2433
    }
2434
    else if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2435
    {
2436
        return;
2437
    }
2438
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2439
    {
2440
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2441
        src1= formatConvBuffer;
2442
        src2= formatConvBuffer+VOFW;
2443
    }
2444

    
2445
#if HAVE_MMX
2446
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2447
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2448
#else
2449
    if (!(flags&SWS_FAST_BILINEAR))
2450
#endif
2451
    {
2452
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2453
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2454
    }
2455
    else // fast bilinear upscale / crap downscale
2456
    {
2457
#if ARCH_X86
2458
#if HAVE_MMX2
2459
        int i;
2460
#if defined(PIC)
2461
        uint64_t ebxsave __attribute__((aligned(8)));
2462
#endif
2463
        if (canMMX2BeUsed)
2464
        {
2465
            __asm__ volatile(
2466
#if defined(PIC)
2467
            "mov          %%"REG_b", %6         \n\t"
2468
#endif
2469
            "pxor             %%mm7, %%mm7      \n\t"
2470
            "mov                 %0, %%"REG_c"  \n\t"
2471
            "mov                 %1, %%"REG_D"  \n\t"
2472
            "mov                 %2, %%"REG_d"  \n\t"
2473
            "mov                 %3, %%"REG_b"  \n\t"
2474
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2475
            PREFETCH"   (%%"REG_c")             \n\t"
2476
            PREFETCH" 32(%%"REG_c")             \n\t"
2477
            PREFETCH" 64(%%"REG_c")             \n\t"
2478

    
2479
#if ARCH_X86_64
2480

    
2481
#define FUNNY_UV_CODE \
2482
            "movl       (%%"REG_b"), %%esi      \n\t"\
2483
            "call               *%4             \n\t"\
2484
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2485
            "add          %%"REG_S", %%"REG_c"  \n\t"\
2486
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2487
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2488

    
2489
#else
2490

    
2491
#define FUNNY_UV_CODE \
2492
            "movl       (%%"REG_b"), %%esi      \n\t"\
2493
            "call               *%4             \n\t"\
2494
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2495
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2496
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2497

    
2498
#endif /* ARCH_X86_64 */
2499

    
2500
FUNNY_UV_CODE
2501
FUNNY_UV_CODE
2502
FUNNY_UV_CODE
2503
FUNNY_UV_CODE
2504
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2505
            "mov                 %5, %%"REG_c"  \n\t" // src
2506
            "mov                 %1, %%"REG_D"  \n\t" // buf1
2507
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
2508
            PREFETCH"   (%%"REG_c")             \n\t"
2509
            PREFETCH" 32(%%"REG_c")             \n\t"
2510
            PREFETCH" 64(%%"REG_c")             \n\t"
2511

    
2512
FUNNY_UV_CODE
2513
FUNNY_UV_CODE
2514
FUNNY_UV_CODE
2515
FUNNY_UV_CODE
2516

    
2517
#if defined(PIC)
2518
            "mov %6, %%"REG_b"    \n\t"
2519
#endif
2520
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2521
            "m" (funnyUVCode), "m" (src2)
2522
#if defined(PIC)
2523
            ,"m" (ebxsave)
2524
#endif
2525
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2526
#if !defined(PIC)
2527
             ,"%"REG_b
2528
#endif
2529
            );
2530
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2531
            {
2532
                //printf("%d %d %d\n", dstWidth, i, srcW);
2533
                dst[i] = src1[srcW-1]*128;
2534
                dst[i+VOFW] = src2[srcW-1]*128;
2535
            }
2536
        }
2537
        else
2538
        {
2539
#endif /* HAVE_MMX2 */
2540
            long xInc_shr16 = (long) (xInc >> 16);
2541
            uint16_t xInc_mask = xInc & 0xffff;
2542
            __asm__ volatile(
2543
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
2544
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2545
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
2546
            ASMALIGN(4)
2547
            "1:                                     \n\t"
2548
            "mov        %0, %%"REG_S"               \n\t"
2549
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2550
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2551
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2552
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2553
            "shll      $16, %%edi                   \n\t"
2554
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2555
            "mov        %1, %%"REG_D"               \n\t"
2556
            "shrl       $9, %%esi                   \n\t"
2557
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2558

    
2559
            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2560
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2561
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2562
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2563
            "shll      $16, %%edi                   \n\t"
2564
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2565
            "mov        %1, %%"REG_D"               \n\t"
2566
            "shrl       $9, %%esi                   \n\t"
2567
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
2568

    
2569
            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
2570
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
2571
            "add        $1, %%"REG_a"               \n\t"
2572
            "cmp        %2, %%"REG_a"               \n\t"
2573
            " jb        1b                          \n\t"
2574

    
2575
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2576
   which is needed to support GCC 4.0. */
2577
#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2578
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2579
#else
2580
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2581
#endif
2582
            "r" (src2)
2583
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2584
            );
2585
#if HAVE_MMX2
2586
        } //if MMX2 can't be used
2587
#endif
2588
#else
2589
        int i;
2590
        unsigned int xpos=0;
2591
        for (i=0;i<dstWidth;i++)
2592
        {
2593
            register unsigned int xx=xpos>>16;
2594
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
2595
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2596
            dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2597
            /* slower
2598
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2599
            dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2600
            */
2601
            xpos+=xInc;
2602
        }
2603
#endif /* ARCH_X86 */
2604
    }
2605
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
2606
        int i;
2607
        //FIXME all pal and rgb srcFormats could do this convertion as well
2608
        //FIXME all scalers more complex than bilinear could do half of this transform
2609
        if(c->srcRange){
2610
            for (i=0; i<dstWidth; i++){
2611
                dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
2612
                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2613
            }
2614
        }else{
2615
            for (i=0; i<dstWidth; i++){
2616
                dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2617
                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2618
            }
2619
        }
2620
    }
2621
}
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2624
                           int srcSliceH, uint8_t* dst[], int dstStride[]){
2625

    
2626
    /* load a few things into local vars to make the code more readable? and faster */
2627
    const int srcW= c->srcW;
2628
    const int dstW= c->dstW;
2629
    const int dstH= c->dstH;
2630
    const int chrDstW= c->chrDstW;
2631
    const int chrSrcW= c->chrSrcW;
2632
    const int lumXInc= c->lumXInc;
2633
    const int chrXInc= c->chrXInc;
2634
    const int dstFormat= c->dstFormat;
2635
    const int srcFormat= c->srcFormat;
2636
    const int flags= c->flags;
2637
    const int canMMX2BeUsed= c->canMMX2BeUsed;
2638
    int16_t *vLumFilterPos= c->vLumFilterPos;
2639
    int16_t *vChrFilterPos= c->vChrFilterPos;
2640
    int16_t *hLumFilterPos= c->hLumFilterPos;
2641
    int16_t *hChrFilterPos= c->hChrFilterPos;
2642
    int16_t *vLumFilter= c->vLumFilter;
2643
    int16_t *vChrFilter= c->vChrFilter;
2644
    int16_t *hLumFilter= c->hLumFilter;
2645
    int16_t *hChrFilter= c->hChrFilter;
2646
    int32_t *lumMmxFilter= c->lumMmxFilter;
2647
    int32_t *chrMmxFilter= c->chrMmxFilter;
2648
    const int vLumFilterSize= c->vLumFilterSize;
2649
    const int vChrFilterSize= c->vChrFilterSize;
2650
    const int hLumFilterSize= c->hLumFilterSize;
2651
    const int hChrFilterSize= c->hChrFilterSize;
2652
    int16_t **lumPixBuf= c->lumPixBuf;
2653
    int16_t **chrPixBuf= c->chrPixBuf;
2654
    const int vLumBufSize= c->vLumBufSize;
2655
    const int vChrBufSize= c->vChrBufSize;
2656
    uint8_t *funnyYCode= c->funnyYCode;
2657
    uint8_t *funnyUVCode= c->funnyUVCode;
2658
    uint8_t *formatConvBuffer= c->formatConvBuffer;
2659
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2660
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2661
    int lastDstY;
2662
    uint32_t *pal=c->pal_yuv;
2663

    
2664
    /* vars which will change and which we need to store back in the context */
2665
    int dstY= c->dstY;
2666
    int lumBufIndex= c->lumBufIndex;
2667
    int chrBufIndex= c->chrBufIndex;
2668
    int lastInLumBuf= c->lastInLumBuf;
2669
    int lastInChrBuf= c->lastInChrBuf;
2670

    
2671
    if (isPacked(c->srcFormat)){
2672
        src[0]=
2673
        src[1]=
2674
        src[2]= src[0];
2675
        srcStride[0]=
2676
        srcStride[1]=
2677
        srcStride[2]= srcStride[0];
2678
    }
2679
    srcStride[1]<<= c->vChrDrop;
2680
    srcStride[2]<<= c->vChrDrop;
2681

    
2682
    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2683
    //       (int)dst[0], (int)dst[1], (int)dst[2]);
2684

    
2685
#if 0 //self test FIXME move to a vfilter or something
2686
    {
2687
    static volatile int i=0;
2688
    i++;
2689
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2690
        selfTest(src, srcStride, c->srcW, c->srcH);
2691
    i--;
2692
    }
2693
#endif
2694

    
2695
    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2696
    //dstStride[0],dstStride[1],dstStride[2]);
2697

    
2698
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
2699
    {
2700
        static int warnedAlready=0; //FIXME move this into the context perhaps
2701
        if (flags & SWS_PRINT_INFO && !warnedAlready)
2702
        {
2703
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2704
                   "         ->cannot do aligned memory accesses anymore\n");
2705
            warnedAlready=1;
2706
        }
2707
    }
2708

    
2709
    /* Note the user might start scaling the picture in the middle so this
2710
       will not get executed. This is not really intended but works
2711
       currently, so people might do it. */
2712
    if (srcSliceY ==0){
2713
        lumBufIndex=0;
2714
        chrBufIndex=0;
2715
        dstY=0;
2716
        lastInLumBuf= -1;
2717
        lastInChrBuf= -1;
2718
    }
2719

    
2720
    lastDstY= dstY;
2721

    
2722
    for (;dstY < dstH; dstY++){
2723
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
2724
        const int chrDstY= dstY>>c->chrDstVSubSample;
2725
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2726
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2727

    
2728
        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2729
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2730
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2731
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2732

    
2733
        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
2734
        // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
2735
        //handle holes (FAST_BILINEAR & weird filters)
2736
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2737
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2738
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
2739
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2740
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2741

    
2742
        // Do we have enough lines in this slice to output the dstY line
2743
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
2744
        {
2745
            //Do horizontal scaling
2746
            while(lastInLumBuf < lastLumSrcY)
2747
            {
2748
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2749
                lumBufIndex++;
2750
                //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
2751
                assert(lumBufIndex < 2*vLumBufSize);
2752
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2753
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
2754
                //printf("%d %d\n", lumBufIndex, vLumBufSize);
2755
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2756
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2757
                                funnyYCode, c->srcFormat, formatConvBuffer,
2758
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
2759
                lastInLumBuf++;
2760
            }
2761
            while(lastInChrBuf < lastChrSrcY)
2762
            {
2763
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2764
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2765
                chrBufIndex++;
2766
                assert(chrBufIndex < 2*vChrBufSize);
2767
                assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2768
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2769
                //FIXME replace parameters through context struct (some at least)
2770

    
2771
                if (!(isGray(srcFormat) || isGray(dstFormat)))
2772
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2773
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2774
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
2775
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
2776
                lastInChrBuf++;
2777
            }
2778
            //wrap buf index around to stay inside the ring buffer
2779
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2780
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2781
        }
2782
        else // not enough lines left in this slice -> load the rest in the buffer
2783
        {
2784
            /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
2785
            firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
2786
            lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
2787
            vChrBufSize, vLumBufSize);*/
2788

    
2789
            //Do horizontal scaling
2790
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
2791
            {
2792
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2793
                lumBufIndex++;
2794
                assert(lumBufIndex < 2*vLumBufSize);
2795
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2796
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
2797
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
2798
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
2799
                                funnyYCode, c->srcFormat, formatConvBuffer,
2800
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
2801
                lastInLumBuf++;
2802
            }
2803
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
2804
            {
2805
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2806
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2807
                chrBufIndex++;
2808
                assert(chrBufIndex < 2*vChrBufSize);
2809
                assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
2810
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2811

    
2812
                if (!(isGray(srcFormat) || isGray(dstFormat)))
2813
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2814
                            flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
2815
                            funnyUVCode, c->srcFormat, formatConvBuffer,
2816
                            c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
2817
                lastInChrBuf++;
2818
            }
2819
            //wrap buf index around to stay inside the ring buffer
2820
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2821
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2822
            break; //we can't output a dstY line so let's try with the next slice
2823
        }
2824

    
2825
#if HAVE_MMX
2826
        c->blueDither= ff_dither8[dstY&1];
2827
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2828
            c->greenDither= ff_dither8[dstY&1];
2829
        else
2830
            c->greenDither= ff_dither4[dstY&1];
2831
        c->redDither= ff_dither8[(dstY+1)&1];
2832
#endif
2833
        if (dstY < dstH-2)
2834
        {
2835
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2836
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2837
#if HAVE_MMX
2838
            int i;
2839
        if (flags & SWS_ACCURATE_RND){
2840
            int s= APCK_SIZE / 8;
2841
            for (i=0; i<vLumFilterSize; i+=2){
2842
                *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
2843
                *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
2844
                          lumMmxFilter[s*i+APCK_COEF/4  ]=
2845
                          lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
2846
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2847
            }
2848
            for (i=0; i<vChrFilterSize; i+=2){
2849
                *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
2850
                *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
2851
                          chrMmxFilter[s*i+APCK_COEF/4  ]=
2852
                          chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
2853
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2854
            }
2855
        }else{
2856
            for (i=0; i<vLumFilterSize; i++)
2857
            {
2858
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2859
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2860
                lumMmxFilter[4*i+2]=
2861
                lumMmxFilter[4*i+3]=
2862
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2863
            }
2864
            for (i=0; i<vChrFilterSize; i++)
2865
            {
2866
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2867
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2868
                chrMmxFilter[4*i+2]=
2869
                chrMmxFilter[4*i+3]=
2870
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2871
            }
2872
        }
2873
#endif
2874
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
2875
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2876
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2877
                RENAME(yuv2nv12X)(c,
2878
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2879
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2880
                    dest, uDest, dstW, chrDstW, dstFormat);
2881
            }
2882
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
2883
            {
2884
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2885
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2886
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
2887
                {
2888
                    int16_t *lumBuf = lumPixBuf[0];
2889
                    int16_t *chrBuf= chrPixBuf[0];
2890
                    RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
2891
                }
2892
                else //General YV12
2893
                {
2894
                    RENAME(yuv2yuvX)(c,
2895
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2896
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2897
                        dest, uDest, vDest, dstW, chrDstW);
2898
                }
2899
            }
2900
            else
2901
            {
2902
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2903
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2904
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
2905
                {
2906
                    int chrAlpha= vChrFilter[2*dstY+1];
2907
                    if(flags & SWS_FULL_CHR_H_INT){
2908
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2909
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2910
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2911
                            dest, dstW, dstY);
2912
                    }else{
2913
                        RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2914
                            dest, dstW, chrAlpha, dstFormat, flags, dstY);
2915
                    }
2916
                }
2917
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
2918
                {
2919
                    int lumAlpha= vLumFilter[2*dstY+1];
2920
                    int chrAlpha= vChrFilter[2*dstY+1];
2921
                    lumMmxFilter[2]=
2922
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
2923
                    chrMmxFilter[2]=
2924
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2925
                    if(flags & SWS_FULL_CHR_H_INT){
2926
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2927
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2928
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2929
                            dest, dstW, dstY);
2930
                    }else{
2931
                        RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2932
                            dest, dstW, lumAlpha, chrAlpha, dstY);
2933
                    }
2934
                }
2935
                else //general RGB
2936
                {
2937
                    if(flags & SWS_FULL_CHR_H_INT){
2938
                        yuv2rgbXinC_full(c,
2939
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2940
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2941
                            dest, dstW, dstY);
2942
                    }else{
2943
                        RENAME(yuv2packedX)(c,
2944
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2945
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2946
                            dest, dstW, dstY);
2947
                    }
2948
                }
2949
            }
2950
        }
2951
        else // hmm looks like we can't use MMX here without overwriting this array's tail
2952
        {
2953
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2954
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2955
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
2956
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2957
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2958
                yuv2nv12XinC(
2959
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2960
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2961
                    dest, uDest, dstW, chrDstW, dstFormat);
2962
            }
2963
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
2964
            {
2965
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2966
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2967
                yuv2yuvXinC(
2968
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2969
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2970
                    dest, uDest, vDest, dstW, chrDstW);
2971
            }
2972
            else
2973
            {
2974
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2975
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2976
                if(flags & SWS_FULL_CHR_H_INT){
2977
                    yuv2rgbXinC_full(c,
2978
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2979
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2980
                        dest, dstW, dstY);
2981
                }else{
2982
                    yuv2packedXinC(c,
2983
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2984
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2985
                        dest, dstW, dstY);
2986
                }
2987
            }
2988
        }
2989
    }
2990

    
2991
#if HAVE_MMX
2992
    __asm__ volatile(SFENCE:::"memory");
2993
    __asm__ volatile(EMMS:::"memory");
2994
#endif
2995
    /* store changed local vars back in the context */
2996
    c->dstY= dstY;
2997
    c->lumBufIndex= lumBufIndex;
2998
    c->chrBufIndex= chrBufIndex;
2999
    c->lastInLumBuf= lastInLumBuf;
3000
    c->lastInChrBuf= lastInChrBuf;
3001

    
3002
    return dstY - lastDstY;
3003
}