Statistics
| Branch: | Revision:

ffmpeg / libswscale / x86 / swscale_template.c @ 7f2ae5c7

History | View | Annotate | Download (131 KB)

1
/*
2
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of Libav.
5
 *
6
 * Libav is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * Libav is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with Libav; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
#include "swscale_template.h"
22

    
23
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH

/* Prefetch instruction for the compiled template variant:
 * 3DNow! has "prefetch", MMX2 (SSE-capable) has "prefetchnta",
 * plain MMX gets an assembler no-op comment. */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH  " # nop"
#endif

/* Byte-wise average: "pavgb" on MMX2, "pavgusb" on 3DNow!.
 * Intentionally left undefined for plain MMX (callers are gated). */
#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* Quadword store: non-temporal "movntq" where available (MMX2),
 * plain "movq" otherwise.  MOVNTQ is a one-level wrapper so that
 * macro arguments are expanded before stringification. */
#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
48

    
49
/* YSCALEYUV2YV12X(x, offset, dest, width)
 * Vertical scaling of one planar output line: walks the (srcPtr, coeff)
 * filter list stored at " offset "(%0) — the list is terminated by a NULL
 * source pointer, detected with "test/jnz" — accumulating
 * pmulhw(filterCoeff, src) on top of the rounder loaded from
 * VROUNDER_OFFSET.  The 16-bit sums are then >>3, packed to unsigned
 * bytes and 8 pixels are stored per outer-loop iteration.
 * Operands: %0 = &c->redDither (base for all named context offsets),
 * %1 = dest, %2 = width.  "x" is an extra byte offset into the source. */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t" /* FIXME Unroll? */\
        "1:                                                 \n\t"\
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
        "add                                $16, %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "pmulhw                           %%mm0, %%mm2      \n\t"\
        "pmulhw                           %%mm0, %%mm5      \n\t"\
        "paddw                            %%mm2, %%mm3      \n\t"\
        "paddw                            %%mm5, %%mm4      \n\t"\
        " jnz                                1b             \n\t"\
        "psraw                               $3, %%mm3      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "packuswb                         %%mm4, %%mm3      \n\t"\
        MOVNTQ(%%mm3, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" ((x86_reg)width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
84

    
85
/* As YSCALEYUV2YV12X, but the "accurate" variant: source lines are
 * consumed in pairs (current pointer plus APCK_PTR2), interleaved with
 * punpck{l,h}wd and multiplied with pmaddwd against the packed APCK_COEF
 * coefficient layout, so accumulation happens in 32-bit precision
 * (mm4-mm7).  Precision is dropped only at the end: >>16, packssdw,
 * add the VROUNDER rounder, >>3, pack to bytes, store 8 pixels.
 * Same operand contract as YSCALEYUV2YV12X. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t"\
        "1:                                                 \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
        "movq                             %%mm0, %%mm3      \n\t"\
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
        "paddd                            %%mm0, %%mm4      \n\t"\
        "paddd                            %%mm3, %%mm5      \n\t"\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "movq                             %%mm2, %%mm0      \n\t"\
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "paddd                            %%mm2, %%mm6      \n\t"\
        "paddd                            %%mm0, %%mm7      \n\t"\
        " jnz                                1b             \n\t"\
        "psrad                              $16, %%mm4      \n\t"\
        "psrad                              $16, %%mm5      \n\t"\
        "psrad                              $16, %%mm6      \n\t"\
        "psrad                              $16, %%mm7      \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
        "packssdw                         %%mm5, %%mm4      \n\t"\
        "packssdw                         %%mm7, %%mm6      \n\t"\
        "paddw                            %%mm0, %%mm4      \n\t"\
        "paddw                            %%mm0, %%mm6      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "psraw                               $3, %%mm6      \n\t"\
        "packuswb                         %%mm6, %%mm4      \n\t"\
        MOVNTQ(%%mm4, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" ((x86_reg)width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
146

    
147
/* Unscaled (single-tap) vertical pass: converts 16-bit intermediate
 * samples to 8-bit output by >>7 (truncating, no rounding) and stores
 * 8 pixels per iteration.
 * NOTE(review): %2 is loaded straight into the index register and the
 * loop exits on "jnc" after "add $8" — this looks like the usual
 * negative-index idiom (callers pass a negative count and the base
 * pointers point past the end); confirm against the call sites. */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ".p2align               4             \n\t" /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
159

    
160
/* Like YSCALEYUV2YV121 but rounding: mm7 is built up to 0x0040 in every
 * word (all-ones >> 15 gives 1, << 6 gives 64), which is added with
 * saturation before the >>7 so the 16->8 bit conversion rounds to
 * nearest instead of truncating. */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ".p2align                4            \n\t" /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
177

    
178
/*
179
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
180
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
181
       "r" (dest), "m" (dstW_reg),
182
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
183
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
184
*/
185
/* Chroma half of the multi-tap packed-output vertical scaler: iterates
 * the NULL-terminated chroma filter list at CHR_MMX_FILTER_OFFSET,
 * accumulating U into mm3 and V into mm4 (V samples sit VOF bytes after
 * U in each source line), starting from the VROUNDER rounder.
 * Opens the asm statement and the outer "1:" pixel loop — must be
 * followed by a *_YA luma body and closed with YSCALEYUV2PACKEDX_END. */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
        "movq                      %%mm3, %%mm4         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
        "add                         $16, %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pmulhw                    %%mm0, %%mm2         \n\t"\
        "pmulhw                    %%mm0, %%mm5         \n\t"\
        "paddw                     %%mm2, %%mm3         \n\t"\
        "paddw                     %%mm5, %%mm4         \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        " jnz                         2b                \n\t"\
208

    
209
/* Luma counterpart of YSCALEYUV2PACKEDX_UV with caller-chosen registers:
 * accumulates the filter list at "offset" into dst1/dst2 (two quads of
 * adjacent Y samples), starting from the VROUNDER rounder.  Uses the
 * inner label "2:" and clobbers REG_d/REG_S like the UV half; the outer
 * loop index in REG_a is shared with the preceding UV body. */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
227

    
228
/* Standard composition: UV accumulation into mm3/mm4, then luma into
 * mm1/mm7 — the register layout the YSCALEYUV2RGBX stage expects. */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

/* Operand/clobber tail that closes the asm statement opened by
 * YSCALEYUV2PACKEDX_UV / YSCALEYUV2PACKEDX_ACCURATE_UV.
 * %0 = &c->redDither, %1-%3 = dummy (kept so operand numbering matches
 * the bodies), %4 = dest, %5 = dstW_reg. */
#define YSCALEYUV2PACKEDX_END                     \
        :: "r" (&c->redDither),                   \
            "m" (dummy), "m" (dummy), "m" (dummy),\
            "r" (dest), "m" (dstW_reg)            \
        : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
238

    
239
/* Accurate chroma accumulation: source lines in pairs (APCK_PTR2),
 * interleaved and multiplied with pmaddwd against APCK_COEF so the sums
 * stay in 32 bits (U in mm4/mm5, V in mm6/mm7).  After the tap loop the
 * results are >>16, packed back to words, rounded, and parked in the
 * U_TEMP/V_TEMP context scratch slots because the following luma body
 * needs all eight mm registers. */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pxor                      %%mm4, %%mm4         \n\t"\
        "pxor                      %%mm5, %%mm5         \n\t"\
        "pxor                      %%mm6, %%mm6         \n\t"\
        "pxor                      %%mm7, %%mm7         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
        "movq                      %%mm0, %%mm3         \n\t"\
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
        "paddd                     %%mm0, %%mm4         \n\t"\
        "paddd                     %%mm3, %%mm5         \n\t"\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        "movq                      %%mm2, %%mm0         \n\t"\
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "paddd                     %%mm2, %%mm6         \n\t"\
        "paddd                     %%mm0, %%mm7         \n\t"\
        " jnz                         2b                \n\t"\
        "psrad                       $16, %%mm4         \n\t"\
        "psrad                       $16, %%mm5         \n\t"\
        "psrad                       $16, %%mm6         \n\t"\
        "psrad                       $16, %%mm7         \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
        "packssdw                  %%mm5, %%mm4         \n\t"\
        "packssdw                  %%mm7, %%mm6         \n\t"\
        "paddw                     %%mm0, %%mm4         \n\t"\
        "paddw                     %%mm0, %%mm6         \n\t"\
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
288

    
289
/* Accurate luma accumulation (32-bit via pmaddwd, same APCK scheme as
 * the UV body): Y1 sums in mm1/mm5, Y2 sums in mm7/mm6.  Finishes by
 * >>16, packing, rounding, and reloading the U/V results saved by
 * YSCALEYUV2PACKEDX_ACCURATE_UV from U_TEMP/V_TEMP into mm3/mm4 —
 * i.e. Y in mm1/mm7 and U/V in mm3/mm4, as YSCALEYUV2RGBX expects. */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\

/* Accurate composition; register layout on exit matches YSCALEYUV2PACKEDX. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
337

    
338
/* YUV -> RGB conversion stage for the multi-tap path.  Input contract
 * (established by YSCALEYUV2PACKEDX[_ACCURATE]): Y1/Y2 in mm1/mm7 and
 * U/V in mm3/mm4; %0 = &c->redDither for the offset/coefficient table.
 * Subtracts the U/V/Y offsets, multiplies by the per-colorspace
 * coefficients, interleaves low/high halves so each component is
 * duplicated per luma sample, and produces the packed 8-bit planes
 * B in mm2, R in mm5, G in mm4. */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\
373

    
374
/* Two-line (bilinear) vertical interpolation for packed YUV output.
 * Computes buf1 + (buf0-buf1)*alpha for luma and chroma via pmulhw on
 * the difference; the +8 filter coefficients in the context are first
 * pre-shifted right by 3 (and written back) so the pmulhw scaling
 * matches the >>7 applied to the second buffer.
 * %0/%1 = luma buf0/buf1, %2/%3 = chroma buf0/buf1, "c" = context. */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
    "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
411

    
412
/* Chroma part of the two-line (bilinear) YUV->RGB path: interpolates
 * uvbuf0/uvbuf1 with the chroma alpha (via pmulhw on the difference,
 * >>4 on the second buffer), then subtracts the U/V offsets and starts
 * the green coefficient multiplies.  Leaves mm2/mm5 = centered U/V and
 * mm3/mm4 = ug/vg for REAL_YSCALEYUV2RGB_COEFF. */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
436

    
437
/* Luma part of the two-line YUV->RGB path: bilinear interpolation of
 * b1/b2 (luma line pointers) with the luma alpha; result Y1 in mm1 and
 * Y2 in mm7 for REAL_YSCALEYUV2RGB_COEFF. */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
450

    
451
/* Coefficient/packing stage shared by the two-line RGB paths.  Expects
 * the register layout produced by REAL_YSCALEYUV2RGB_UV (+_YA):
 * Y1/Y2 in mm1/mm7, centered U/V in mm2/mm5, ug/vg in mm3/mm4.
 * Produces packed 8-bit B in mm2, R in mm5, G in mm4 — identical math
 * to YSCALEYUV2RGBX but addressing the context through "c". */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/* Full two-line YUV->RGB body: chroma interpolation, luma interpolation
 * from %0/%1, then the coefficient/packing stage. */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
486

    
487
/* Single-line packed-YUV path (no vertical interpolation): just shifts
 * the 16-bit intermediates down by 7.  Chroma from %2 (V at +VOF) into
 * mm3/mm4, luma from %0 into mm1/mm7. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
501

    
502
#define REAL_YSCALEYUV2RGB1(index, c) \
503
    "xor            "#index", "#index"  \n\t"\
504
    ".p2align              4            \n\t"\
505
    "1:                                 \n\t"\
506
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
507
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
508
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
509
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
510
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
511
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
512
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
513
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
514
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
515
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
516
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
517
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
518
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
519
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
520
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
521
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
522
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
523
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
524
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
525
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
526
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
527
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
528
    "paddw             %%mm3, %%mm4     \n\t"\
529
    "movq              %%mm2, %%mm0     \n\t"\
530
    "movq              %%mm5, %%mm6     \n\t"\
531
    "movq              %%mm4, %%mm3     \n\t"\
532
    "punpcklwd         %%mm2, %%mm2     \n\t"\
533
    "punpcklwd         %%mm5, %%mm5     \n\t"\
534
    "punpcklwd         %%mm4, %%mm4     \n\t"\
535
    "paddw             %%mm1, %%mm2     \n\t"\
536
    "paddw             %%mm1, %%mm5     \n\t"\
537
    "paddw             %%mm1, %%mm4     \n\t"\
538
    "punpckhwd         %%mm0, %%mm0     \n\t"\
539
    "punpckhwd         %%mm6, %%mm6     \n\t"\
540
    "punpckhwd         %%mm3, %%mm3     \n\t"\
541
    "paddw             %%mm7, %%mm0     \n\t"\
542
    "paddw             %%mm7, %%mm6     \n\t"\
543
    "paddw             %%mm7, %%mm3     \n\t"\
544
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
545
    "packuswb          %%mm0, %%mm2     \n\t"\
546
    "packuswb          %%mm6, %%mm5     \n\t"\
547
    "packuswb          %%mm3, %%mm4     \n\t"\
548

    
549
/* Wrapper so macro arguments are fully expanded before being pasted into the asm string. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
550

    
551
/* Single luma buffer variant for packed output: averages the chroma of
 * uvbuf0/uvbuf1 (paddw then >>8) and shifts luma down to 8 bits (>>7).
 * Leaves Y in mm1/mm7, U in mm3, V in mm4 for the WRITE* macros. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
568

    
569
// do vertical chrominance interpolation
/* Single luma buffer YUV->RGB: chroma from uvbuf0/uvbuf1 is averaged
 * (paddw then >>5, keeping 3 fractional bits), then the standard
 * coefficient multiplies and packing are applied.  On exit:
 * mm2 = packed B, mm4 = packed G, mm5 = packed R (see final comment). */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
622

    
623
/* Load 8 alpha samples from abuf0 (operand %1), scale >>7 to 8 bits and
 * pack them into %%mm7 for the 32-bit RGB writer. */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
630

    
631
/* Interleave the packed b/g/r/a byte registers into 8 ARGB-ordered 32-bit
 * pixels, store them at dst + index*4, then advance index and loop back to
 * local label "1" while index < dstw.  q0/q2/q3/t are scratch registers. */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
654

    
655
/* Pack mm2=B, mm4=G, mm5=R (mm7 must be 0) into 5-6-5 RGB16 pixels and
 * store 8 of them at dst + index*2; advances index and loops to label "1". */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
682

    
683
/* Pack mm2=B, mm4=G, mm5=R (mm7 must be 0) into 5-5-5 RGB15 pixels and
 * store 8 of them at dst + index*2; advances index and loops to label "1". */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
711

    
712
/* Legacy plain-MMX 24-bit writer: interleaves mm2=B, mm4=G, mm5=R (mm7=0)
 * into 8 tightly packed RGB24 pixels using shift/mask sequences, stores
 * 24 bytes, then advances dst and index and loops to label "1". */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
767

    
768
/* Plain-MMX 24-bit writer: packs mm2=B, mm4=G, mm5=R (mm7=0) into 24 bytes
 * of RGB24 at dst, advances dst by 24 and index by 8, loops to label "1". */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
820

    
821
/* MMX2 24-bit writer: uses pshufw plus the ff_M24A/B/C byte masks to build
 * three quadwords of packed RGB24 from mm2=B, mm4=G, mm5=R; advances dst
 * by 24 and index by 8, loops to label "1". */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
868

    
869
/* Select the 24-bit writer for this template instantiation:
 * the pshufw-based variant on MMX2, the plain-MMX variant otherwise. */
#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
876

    
877
/* Pack mm1/mm7 (luma) with mm3 (U) and mm4 (V) into interleaved YUYV bytes
 * and store 16 bytes at dst + index*2; advances index, loops to label "1". */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
893

    
894

    
895
/**
 * Vertically scale multiple source lines into planar YV12 output using the
 * MMX filter kernels; chroma/alpha planes are written only when their
 * destination pointers are non-NULL.  Falls back to the C implementation
 * (yuv2yuvXinC) when SWS_BITEXACT is requested.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
}
927

    
928
/**
 * NV12/NV21 vertical scaling: no MMX path here, this simply forwards to
 * the C implementation (yuv2nv12XinC).
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
936

    
937
/**
 * 1:1 vertical "scale" (single source line) to planar output: each plane is
 * rounded ((x+64)>>7), clipped to 0..255 and stored.  Uses the MMX
 * YSCALEYUV2YV121[_ACCURATE] kernels unless SWS_BITEXACT is set, in which
 * case the C loops below are used.  NULL plane pointers are skipped.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        /* Per-plane source/dest/count tables: alpha, luma, U, V.
         * The asm below indexes from the end of each row (negative counter). */
        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        /* bit 8 set => out of the 0..255 range (negative values have the
         * high bits set in two's complement, values >255 set bit 8) */
        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
1005

    
1006

    
1007
/**
1008
 * vertical scale YV12 to RGB
1009
 */
1010
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1011
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1012
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1013
{
1014
    x86_reg dummy=0;
1015
    x86_reg dstW_reg = dstW;
1016
    if(!(c->flags & SWS_BITEXACT)) {
1017
        if (c->flags & SWS_ACCURATE_RND) {
1018
            switch(c->dstFormat) {
1019
            case PIX_FMT_RGB32:
1020
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1021
                    YSCALEYUV2PACKEDX_ACCURATE
1022
                    YSCALEYUV2RGBX
1023
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1024
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1025
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1026
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1027
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1028
                    "psraw                        $3, %%mm1         \n\t"
1029
                    "psraw                        $3, %%mm7         \n\t"
1030
                    "packuswb                  %%mm7, %%mm1         \n\t"
1031
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1032

    
1033
                    YSCALEYUV2PACKEDX_END
1034
                } else {
1035
                    YSCALEYUV2PACKEDX_ACCURATE
1036
                    YSCALEYUV2RGBX
1037
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1038
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1039

    
1040
                    YSCALEYUV2PACKEDX_END
1041
                }
1042
                return;
1043
            case PIX_FMT_BGR24:
1044
                YSCALEYUV2PACKEDX_ACCURATE
1045
                YSCALEYUV2RGBX
1046
                "pxor %%mm7, %%mm7 \n\t"
1047
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1048
                "add %4, %%"REG_c"                        \n\t"
1049
                WRITEBGR24(%%REGc, %5, %%REGa)
1050

    
1051

    
1052
                :: "r" (&c->redDither),
1053
                "m" (dummy), "m" (dummy), "m" (dummy),
1054
                "r" (dest), "m" (dstW_reg)
1055
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1056
                );
1057
                return;
1058
            case PIX_FMT_RGB555:
1059
                YSCALEYUV2PACKEDX_ACCURATE
1060
                YSCALEYUV2RGBX
1061
                "pxor %%mm7, %%mm7 \n\t"
1062
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1063
#ifdef DITHER1XBPP
1064
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1065
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1066
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1067
#endif
1068

    
1069
                WRITERGB15(%4, %5, %%REGa)
1070
                YSCALEYUV2PACKEDX_END
1071
                return;
1072
            case PIX_FMT_RGB565:
1073
                YSCALEYUV2PACKEDX_ACCURATE
1074
                YSCALEYUV2RGBX
1075
                "pxor %%mm7, %%mm7 \n\t"
1076
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1077
#ifdef DITHER1XBPP
1078
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1079
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1080
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1081
#endif
1082

    
1083
                WRITERGB16(%4, %5, %%REGa)
1084
                YSCALEYUV2PACKEDX_END
1085
                return;
1086
            case PIX_FMT_YUYV422:
1087
                YSCALEYUV2PACKEDX_ACCURATE
1088
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1089

    
1090
                "psraw $3, %%mm3    \n\t"
1091
                "psraw $3, %%mm4    \n\t"
1092
                "psraw $3, %%mm1    \n\t"
1093
                "psraw $3, %%mm7    \n\t"
1094
                WRITEYUY2(%4, %5, %%REGa)
1095
                YSCALEYUV2PACKEDX_END
1096
                return;
1097
            }
1098
        } else {
1099
            switch(c->dstFormat) {
1100
            case PIX_FMT_RGB32:
1101
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1102
                    YSCALEYUV2PACKEDX
1103
                    YSCALEYUV2RGBX
1104
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1105
                    "psraw                        $3, %%mm1         \n\t"
1106
                    "psraw                        $3, %%mm7         \n\t"
1107
                    "packuswb                  %%mm7, %%mm1         \n\t"
1108
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1109
                    YSCALEYUV2PACKEDX_END
1110
                } else {
1111
                    YSCALEYUV2PACKEDX
1112
                    YSCALEYUV2RGBX
1113
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1114
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1115
                    YSCALEYUV2PACKEDX_END
1116
                }
1117
                return;
1118
            case PIX_FMT_BGR24:
1119
                YSCALEYUV2PACKEDX
1120
                YSCALEYUV2RGBX
1121
                "pxor                    %%mm7, %%mm7       \n\t"
1122
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1123
                "add                        %4, %%"REG_c"   \n\t"
1124
                WRITEBGR24(%%REGc, %5, %%REGa)
1125

    
1126
                :: "r" (&c->redDither),
1127
                "m" (dummy), "m" (dummy), "m" (dummy),
1128
                "r" (dest),  "m" (dstW_reg)
1129
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1130
                );
1131
                return;
1132
            case PIX_FMT_RGB555:
1133
                YSCALEYUV2PACKEDX
1134
                YSCALEYUV2RGBX
1135
                "pxor %%mm7, %%mm7 \n\t"
1136
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1137
#ifdef DITHER1XBPP
1138
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1139
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1140
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1141
#endif
1142

    
1143
                WRITERGB15(%4, %5, %%REGa)
1144
                YSCALEYUV2PACKEDX_END
1145
                return;
1146
            case PIX_FMT_RGB565:
1147
                YSCALEYUV2PACKEDX
1148
                YSCALEYUV2RGBX
1149
                "pxor %%mm7, %%mm7 \n\t"
1150
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1151
#ifdef DITHER1XBPP
1152
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1153
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1154
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1155
#endif
1156

    
1157
                WRITERGB16(%4, %5, %%REGa)
1158
                YSCALEYUV2PACKEDX_END
1159
                return;
1160
            case PIX_FMT_YUYV422:
1161
                YSCALEYUV2PACKEDX
1162
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1163

    
1164
                "psraw $3, %%mm3    \n\t"
1165
                "psraw $3, %%mm4    \n\t"
1166
                "psraw $3, %%mm1    \n\t"
1167
                "psraw $3, %%mm7    \n\t"
1168
                WRITEYUY2(%4, %5, %%REGa)
1169
                YSCALEYUV2PACKEDX_END
1170
                return;
1171
            }
1172
        }
1173
    }
1174
    yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1175
                   chrFilter, chrSrc, chrFilterSize,
1176
                   alpSrc, dest, dstW, dstY);
1177
}
1178

    
1179
/**
1180
 * vertical bilinear scale YV12 to RGB
1181
 */
1182
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1183
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1184
{
1185
    int  yalpha1=4095- yalpha;
1186
    int uvalpha1=4095-uvalpha;
1187
    int i;
1188

    
1189
    if(!(c->flags & SWS_BITEXACT)) {
1190
        switch(c->dstFormat) {
1191
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1192
        case PIX_FMT_RGB32:
1193
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1194
#if ARCH_X86_64
1195
                __asm__ volatile(
1196
                    YSCALEYUV2RGB(%%r8, %5)
1197
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1198
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1199
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1200
                    "packuswb            %%mm7, %%mm1       \n\t"
1201
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1202

    
1203
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1204
                    "a" (&c->redDither)
1205
                    ,"r" (abuf0), "r" (abuf1)
1206
                    : "%r8"
1207
                );
1208
#else
1209
                *(const uint16_t **)(&c->u_temp)=abuf0;
1210
                *(const uint16_t **)(&c->v_temp)=abuf1;
1211
                __asm__ volatile(
1212
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1213
                    "mov        %4, %%"REG_b"               \n\t"
1214
                    "push %%"REG_BP"                        \n\t"
1215
                    YSCALEYUV2RGB(%%REGBP, %5)
1216
                    "push                   %0              \n\t"
1217
                    "push                   %1              \n\t"
1218
                    "mov          "U_TEMP"(%5), %0          \n\t"
1219
                    "mov          "V_TEMP"(%5), %1          \n\t"
1220
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1221
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1222
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1223
                    "packuswb            %%mm7, %%mm1       \n\t"
1224
                    "pop                    %1              \n\t"
1225
                    "pop                    %0              \n\t"
1226
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1227
                    "pop %%"REG_BP"                         \n\t"
1228
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1229

    
1230
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1231
                    "a" (&c->redDither)
1232
                );
1233
#endif
1234
            } else {
1235
                __asm__ volatile(
1236
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1237
                    "mov        %4, %%"REG_b"               \n\t"
1238
                    "push %%"REG_BP"                        \n\t"
1239
                    YSCALEYUV2RGB(%%REGBP, %5)
1240
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1241
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1242
                    "pop %%"REG_BP"                         \n\t"
1243
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1244

    
1245
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1246
                    "a" (&c->redDither)
1247
                );
1248
            }
1249
            return;
1250
        case PIX_FMT_BGR24:
1251
            __asm__ volatile(
1252
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1253
                "mov        %4, %%"REG_b"               \n\t"
1254
                "push %%"REG_BP"                        \n\t"
1255
                YSCALEYUV2RGB(%%REGBP, %5)
1256
                "pxor    %%mm7, %%mm7                   \n\t"
1257
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1258
                "pop %%"REG_BP"                         \n\t"
1259
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1260
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1261
                "a" (&c->redDither)
1262
            );
1263
            return;
1264
        case PIX_FMT_RGB555:
1265
            __asm__ volatile(
1266
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1267
                "mov        %4, %%"REG_b"               \n\t"
1268
                "push %%"REG_BP"                        \n\t"
1269
                YSCALEYUV2RGB(%%REGBP, %5)
1270
                "pxor    %%mm7, %%mm7                   \n\t"
1271
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1272
#ifdef DITHER1XBPP
1273
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1274
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1275
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1276
#endif
1277

    
1278
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1279
                "pop %%"REG_BP"                         \n\t"
1280
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1281

    
1282
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1283
                "a" (&c->redDither)
1284
            );
1285
            return;
1286
        case PIX_FMT_RGB565:
1287
            __asm__ volatile(
1288
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1289
                "mov        %4, %%"REG_b"               \n\t"
1290
                "push %%"REG_BP"                        \n\t"
1291
                YSCALEYUV2RGB(%%REGBP, %5)
1292
                "pxor    %%mm7, %%mm7                   \n\t"
1293
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1294
#ifdef DITHER1XBPP
1295
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1296
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1297
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1298
#endif
1299

    
1300
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1301
                "pop %%"REG_BP"                         \n\t"
1302
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1303
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1304
                "a" (&c->redDither)
1305
            );
1306
            return;
1307
        case PIX_FMT_YUYV422:
1308
            __asm__ volatile(
1309
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1310
                "mov %4, %%"REG_b"                        \n\t"
1311
                "push %%"REG_BP"                        \n\t"
1312
                YSCALEYUV2PACKED(%%REGBP, %5)
1313
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1314
                "pop %%"REG_BP"                         \n\t"
1315
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1316
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1317
                "a" (&c->redDither)
1318
            );
1319
            return;
1320
        default: break;
1321
        }
1322
    }
1323
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1324
}
1325

    
1326
/**
 * YV12 to RGB without scaling or interpolating.
 *
 * Renders one output line from a single luma line (buf0), two chroma
 * lines (uvbuf0/uvbuf1) and an optional alpha line (abuf0).  MMX fast
 * paths exist for RGB32, BGR24, RGB555, RGB565 and YUYV422; all other
 * formats — and SWS_BITEXACT mode — fall back to the C macros at the
 * bottom of the function.
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    /* Full horizontal chroma interpolation requested: reuse the two-line
     * blending path with both luma/alpha pointers aimed at the same data. */
    if (flags&SWS_FULL_CHR_H_INT) {
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

    /* The MMX paths do not round identically to the C code, so they are
     * only taken when bit-exact output was not requested. */
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    /* Every asm block below saves REG_b and REG_BP first so the
                     * YSCALE*/WRITE* macros may use them freely; REG_b is loaded
                     * with dest (%4) and both are restored on exit. */
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7                   \n\t" /* alpha = 0xFF.. (opaque) */
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                    );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        } else { /* uvalpha >= 2048: use the ...1b macro variants, which
                  * presumably blend both chroma lines — see macro defs. */
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7                   \n\t" /* alpha = opaque */
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        }
    }
    /* C fallback for every other destination format (and SWS_BITEXACT). */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}

//FIXME yuy2* can read up to 7 samples too much

/* Extract the luma (Y) plane from packed YUY2/YUYV input: luma occupies the
 * even bytes, which the bm01010101 byte mask selects; 8 Y samples are
 * produced per iteration.  The counter runs from -width up to 0 and indexes
 * off the ends of the buffers, avoiding a separate end-pointer compare.
 * 'unused' only exists to match the common input-converter signature. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
        "mov                    %0, %%"REG_a"       \n\t" /* REG_a = -width */
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "pand                %%mm2, %%mm0           \n\t" /* keep even bytes (Y) */
        "pand                %%mm2, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t" /* pack words to 8 Y bytes */
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
1585

    
1586
/* Extract the chroma planes from packed YUY2/YUYV input.  The first
 * psrlw $8 drops the luma byte of each Y/chroma word, leaving interleaved
 * U,V bytes after the pack; these are then split into U (even bytes, via
 * the mask) and V (odd bytes, via a second shift), 4 samples per plane
 * per iteration.  Only interleaved chroma is supported: src1 must equal
 * src2 (asserted below, after the fact). */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t" /* REG_a = -width */
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* word -> its chroma byte */
        "psrlw                  $8, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t" /* U0 V0 U1 V1 ... */
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* odd bytes  -> V */
        "pand                %%mm4, %%mm1           \n\t" /* even bytes -> U */
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t" /* 4 V bytes */
        "movd                %%mm1, (%2, %%"REG_a") \n\t" /* 4 U bytes */
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    /* NOTE(review): precondition is only checked after the work is done. */
    assert(src1 == src2);
}
1611

    
1612
/* Convert two planes of 16-bit little-endian samples to 8-bit chroma
 * planes by keeping the most significant byte of each sample
 * (psrlw $8 on little-endian data selects the high byte); src1 feeds
 * dstU and src2 feeds dstV, 8 samples per plane per iteration. */
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "mov                    %0, %%"REG_a"       \n\t" /* REG_a = -width */
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* keep high byte of each word */
        "psrlw                  $8, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}
1635

    
1636
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/* Extract the luma (Y) plane from packed UYVY input: luma occupies the
 * odd bytes, selected by psrlw $8; 8 Y samples per iteration.  The
 * counter runs from -width to 0, indexing off the buffer ends. */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    __asm__ volatile(
        "mov                  %0, %%"REG_a"         \n\t" /* REG_a = -width */
        "1:                                         \n\t"
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
        "psrlw                $8, %%mm0             \n\t" /* keep odd bytes (Y) */
        "psrlw                $8, %%mm1             \n\t"
        "packuswb          %%mm1, %%mm0             \n\t"
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
        "add                  $8, %%"REG_a"         \n\t"
        " js                  1b                    \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
1655

    
1656
/* Extract the chroma planes from packed UYVY input.  Chroma occupies the
 * even bytes, selected with the bm01010101 mask; after packing, the
 * interleaved U,V bytes are split into U (even, mask) and V (odd,
 * psrlw $8), 4 samples per plane per iteration.  Only interleaved chroma
 * is supported: src1 must equal src2 (asserted below, after the fact). */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t" /* REG_a = -width */
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "pand                %%mm4, %%mm0           \n\t" /* keep even bytes (chroma) */
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t" /* U0 V0 U1 V1 ... */
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* odd bytes  -> V */
        "pand                %%mm4, %%mm1           \n\t" /* even bytes -> U */
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t" /* 4 V bytes */
        "movd                %%mm1, (%2, %%"REG_a") \n\t" /* 4 U bytes */
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    /* NOTE(review): precondition is only checked after the work is done. */
    assert(src1 == src2);
}
1681

    
1682
/* Convert two planes of 16-bit big-endian samples to 8-bit chroma planes.
 * The bm01010101 byte mask keeps the byte at the lower address of each
 * sample — the most significant byte of a big-endian value; src1 feeds
 * dstU and src2 feeds dstV, 8 samples per plane per iteration. */
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t" /* REG_a = -width */
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t" /* keep low-addressed byte (MSB) */
        "pand                %%mm4, %%mm1           \n\t"
        "pand                %%mm4, %%mm2           \n\t"
        "pand                %%mm4, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}
1706

    
1707
/* Deinterleave a packed NV12/NV21-style chroma plane: even bytes go to
 * dst1 (bm01010101 mask) and odd bytes to dst2 (psrlw $8), 8 samples
 * per plane per iteration.  nv12ToUV/nv21ToUV choose which output gets
 * which destination. */
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t" /* REG_a = -width */
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq                %%mm0, %%mm2           \n\t"
        "movq                %%mm1, %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t" /* even bytes -> dst1 */
        "pand                %%mm4, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t" /* odd bytes  -> dst2 */
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
}
1732

    
1733
/* NV12 chroma input (interleaved U,V): even bytes to dstU, odd bytes to
 * dstV.  src2 and unused exist only for signature compatibility; only
 * the single packed plane in src1 is read. */
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}
1739

    
1740
/* NV21 chroma input (interleaved V,U): same as nv12ToUV but with the
 * destination planes swapped.  src2 and unused are ignored. */
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
1746

    
1747
/* Convert packed 24-bit BGR or RGB to an 8-bit luma line.  The first asm
 * statement loads the coefficient quadwords for the requested byte order
 * into mm5/mm6; the main loop then processes 4 pixels (12 bytes) per
 * iteration with pmaddwd, adds the rounding/offset constant
 * ff_bgr24toYOffset (mm4) and scales down with psrad $15.
 * NOTE(review): this relies on mm5/mm6 surviving between the two asm
 * statements — no clobber list declares them, so the compiler must not
 * insert FP/MMX code in between. */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t" /* REG_a = -width */
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t" /* pixels 0/1, staggered by */
        "movd                    2(%0), %%mm1       \n\t" /* 2 bytes for the madd pairs */
        "movd                    6(%0), %%mm2       \n\t" /* pixels 2/3 */
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t" /* widen bytes to words */
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t" /* apply luma coefficients */
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t" /* + offset/rounding */
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t" /* store 4 Y bytes */
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    : "%"REG_a
    );
}
1799

    
1800
/* Convert packed 24-bit BGR or RGB to 8-bit U and V lines.  %4 points at
 * the ff_bgr24toUV coefficient block for the requested byte order
 * (quadwords at 0/8/16/24; the last is cached in mm6).  The loop handles
 * 4 pixels (12 bytes) per iteration: two pixel pairs are multiply-
 * accumulated with pmaddwd into U (mm0/mm1) and V (mm2/mm4) sums, then
 * biased with ff_bgr24toUVOffset and scaled down with psrad $15. */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq                    24(%4), %%mm6       \n\t" /* cache 4th coeff quadword */
        "mov                        %3, %%"REG_a"   \n\t" /* REG_a = -width */
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t" /* pixels 0/1 */
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t" /* widen bytes to words */
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                  (%4), %%mm0       \n\t" /* U coefficients */
        "pmaddwd                 8(%4), %%mm1       \n\t"
        "pmaddwd                16(%4), %%mm2       \n\t" /* V coefficients */
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        "movd                    6(%0), %%mm1       \n\t" /* pixels 2/3 */
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                  (%4), %%mm1       \n\t"
        "pmaddwd                 8(%4), %%mm3       \n\t"
        "pmaddwd                16(%4), %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t" /* bias + rounding */
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t" /* 4 U bytes */
        "movd                %%mm2, (%2, %%"REG_a") \n\t" /* 4 V bytes */
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
    : "%"REG_a
    );
}
1857

    
1858
/* Convert one line of packed BGR24 pixels to Y (luma) samples.
 * Thin wrapper that selects the BGR component order for the shared
 * MMX conversion helper. 'unused' keeps the common toYV12 signature. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}
1862

    
1863
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1864
{
1865
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1866
    assert(src1 == src2);
1867
}
1868

    
1869
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1870
{
1871
    int i;
1872
    for (i=0; i<width; i++) {
1873
        int b= src1[6*i + 0] + src1[6*i + 3];
1874
        int g= src1[6*i + 1] + src1[6*i + 4];
1875
        int r= src1[6*i + 2] + src1[6*i + 5];
1876

    
1877
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1878
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1879
    }
1880
    assert(src1 == src2);
1881
}
1882

    
1883
/* Convert one line of packed RGB24 pixels to Y (luma) samples.
 * Thin wrapper that selects the RGB component order for the shared
 * MMX conversion helper. 'unused' keeps the common toYV12 signature. */
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}
1887

    
1888
/* Convert one line of packed RGB24 pixels to U/V (chroma) samples.
 * Only src1 is read; for packed input the two source pointers must alias,
 * which is asserted before conversion. 'unused' keeps the common signature. */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
}
1893

    
1894
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1895
{
1896
    int i;
1897
    assert(src1==src2);
1898
    for (i=0; i<width; i++) {
1899
        int r= src1[6*i + 0] + src1[6*i + 3];
1900
        int g= src1[6*i + 1] + src1[6*i + 4];
1901
        int b= src1[6*i + 2] + src1[6*i + 5];
1902

    
1903
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1904
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1905
    }
1906
}
1907

    
1908

    
1909
// bilinear / bicubic scaling
1910
/* Horizontally scale one line of 8-bit source samples into 16-bit dst using
 * MMX. For each output pixel, filterPos gives the first source index and
 * 'filter' holds filterSize 16-bit taps; the accumulated products are
 * shifted right by 7 and packed with signed saturation.
 * Three code paths: specialized loops for filterSize 4 and 8 (two output
 * pixels per iteration), and a generic path for any multiple of 4.
 * NOTE(review): 'counter' starts at -2*dstW and the filter/filterPos/dst
 * pointers are pre-biased, so the loops index with the negative counter and
 * terminate via the carry flag ("jnc") when it reaches zero. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push            %%"REG_b"              \n\t"
#endif
            "pxor                %%mm7, %%mm7       \n\t"
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
            ".p2align                4              \n\t"
            "1:                                     \n\t"
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
            "movd      (%3, %%"REG_a"), %%mm0       \n\t"
            "movd      (%3, %%"REG_b"), %%mm2       \n\t"
            "punpcklbw           %%mm7, %%mm0       \n\t"
            "punpcklbw           %%mm7, %%mm2       \n\t"
            "pmaddwd             %%mm1, %%mm0       \n\t"
            "pmaddwd             %%mm2, %%mm3       \n\t"
            "movq                %%mm0, %%mm4       \n\t"
            "punpckldq           %%mm3, %%mm0       \n\t"
            "punpckhdq           %%mm3, %%mm4       \n\t"
            "paddd               %%mm4, %%mm0       \n\t"
            "psrad                  $7, %%mm0       \n\t"
            "packssdw            %%mm0, %%mm0       \n\t"
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
            "add                    $4, %%"REG_BP"  \n\t"
            " jnc                   1b              \n\t"

            "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
            "pop             %%"REG_b"              \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push             %%"REG_b"             \n\t"
#endif
            "pxor                 %%mm7, %%mm7      \n\t"
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
            "mov              %%"REG_a", %%"REG_BP" \n\t"
            ".p2align                 4             \n\t"
            "1:                                     \n\t"
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
            "movd       (%3, %%"REG_a"), %%mm0      \n\t"
            "movd       (%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm0      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm0      \n\t"
            "pmaddwd              %%mm2, %%mm3      \n\t"

            /* second group of 4 taps for each of the two output pixels */
            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
            "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
            "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm4      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm4      \n\t"
            "pmaddwd              %%mm2, %%mm5      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "paddd                %%mm5, %%mm3      \n\t"
            "movq                 %%mm0, %%mm4      \n\t"
            "punpckldq            %%mm3, %%mm0      \n\t"
            "punpckhdq            %%mm3, %%mm4      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "psrad                   $7, %%mm0      \n\t"
            "packssdw             %%mm0, %%mm0      \n\t"
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
            "add                     $4, %%"REG_BP" \n\t"
            " jnc                    1b             \n\t"

            "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
            "pop              %%"REG_b"             \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        /* Generic path: inner loop (label 2) walks filterSize taps,
         * accumulating two output pixels at a time in mm4/mm5. */
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor                  %%mm7, %%mm7     \n\t"
            ".p2align                  4            \n\t"
            "1:                                     \n\t"
            "mov                      %2, %%"REG_c" \n\t"
            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
            "mov                      %5, %%"REG_c" \n\t"
            "pxor                  %%mm4, %%mm4     \n\t"
            "pxor                  %%mm5, %%mm5     \n\t"
            "2:                                     \n\t"
            "movq                   (%1), %%mm1     \n\t"
            "movq               (%1, %6), %%mm3     \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
            "punpcklbw             %%mm7, %%mm0     \n\t"
            "punpcklbw             %%mm7, %%mm2     \n\t"
            "pmaddwd               %%mm1, %%mm0     \n\t"
            "pmaddwd               %%mm2, %%mm3     \n\t"
            "paddd                 %%mm3, %%mm5     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "add                      $8, %1        \n\t"
            "add                      $4, %%"REG_c" \n\t"
            "cmp                      %4, %%"REG_c" \n\t"
            " jb                      2b            \n\t"
            "add                      %6, %1        \n\t"
            "movq                  %%mm4, %%mm0     \n\t"
            "punpckldq             %%mm5, %%mm4     \n\t"
            "punpckhdq             %%mm5, %%mm0     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "psrad                    $7, %%mm4     \n\t"
            "packssdw              %%mm4, %%mm4     \n\t"
            "mov                      %3, %%"REG_a" \n\t"
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
            "add                      $4, %0        \n\t"
            " jnc                     1b            \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
            "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
}
2063

    
2064
//FIXME all pal and rgb srcFormats could do this convertion as well
2065
//FIXME all scalers more complex than bilinear could do half of this transform
2066
static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2067
{
2068
    int i;
2069
    for (i = 0; i < width; i++) {
2070
        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2071
        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2072
    }
2073
}
2074
static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2075
{
2076
    int i;
2077
    for (i = 0; i < width; i++) {
2078
        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
2079
        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2080
    }
2081
}
2082
static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2083
{
2084
    int i;
2085
    for (i = 0; i < width; i++)
2086
        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2087
}
2088
static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2089
{
2090
    int i;
2091
    for (i = 0; i < width; i++)
2092
        dst[i] = (dst[i]*14071 + 33561947)>>14;
2093
}
2094

    
2095
#define FAST_BILINEAR_X86 \
2096
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
2097
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
2098
    "shll      $16, %%edi    \n\t"                                              \
2099
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
2100
    "mov        %1, %%"REG_D"\n\t"                                              \
2101
    "shrl       $9, %%esi    \n\t"                                              \
2102

    
2103
/* Fast horizontal luma scaling into the 16-bit temp buffer.
 * On MMX2 builds, when the context says the runtime-generated filter code
 * can be used, it jumps into that code repeatedly via CALL_MMX2_FILTER_CODE
 * (the macro is defined here and reused by hcscale_fast below); otherwise
 * it falls back to a plain x86 loop that bilinearly interpolates two
 * neighbouring source pixels per output sample using FAST_BILINEAR_X86,
 * stepping the source position by the 16.16 fixed-point increment xInc. */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    /* ebx is the PIC register; save/restore it around the asm block. */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

#define CALL_MMX2_FILTER_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* Fill the tail where the source index would run past srcW-1. */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    x86_reg dstWidth_reg = dstWidth;
    //NO MMX just normal asm ...
    /* Two output samples per iteration; the fractional position lives in
     * %cx and the integer source index (xx) advances via add-with-carry. */
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ".p2align                4           \n\t"
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
}
2214

    
2215
      // *** horizontal scale Y line to temp buffer
2216
/* Horizontally scale one luma (or alpha, when isAlpha) line into the temp
 * buffer: optional input-format conversion to YV12 via formatConvBuffer,
 * then either the generic filtered hScale or the fast bilinear path, then
 * an optional range conversion (luma only). */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                   const int16_t *hLumFilter,
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal, int isAlpha)
{
    void (*inputConvert)(uint8_t *, const uint8_t *, long, uint32_t *);
    void (*rangeConvert)(uint16_t *, int);

    if (isAlpha) {
        inputConvert = c->alpToYV12;
        rangeConvert = NULL;          /* no range conversion for alpha */
        src         += c->alpSrcOffset;
    } else {
        inputConvert = c->lumToYV12;
        rangeConvert = c->lumConvertRange;
        src         += c->lumSrcOffset;
    }

    if (inputConvert) {
        /* Convert the input line into formatConvBuffer and scale from there. */
        inputConvert(formatConvBuffer, src, srcW, pal);
        src = formatConvBuffer;
    }

    if (c->hyscale_fast) { // fast bilinear upscale / crap downscale
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
    } else {
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }

    if (rangeConvert)
        rangeConvert(dst, dstWidth);
}
2241

    
2242
/* Fast horizontal chroma scaling into the 16-bit temp buffer: src1 (U) is
 * scaled into dst[0..] and src2 (V) into dst[VOFW..].
 * On MMX2 builds with usable runtime-generated filter code it calls that
 * code via CALL_MMX2_FILTER_CODE (defined in hyscale_fast above) once per
 * plane; otherwise a plain x86 loop interpolates both planes bilinearly in
 * one pass, stepping by the 16.16 fixed-point increment xInc. */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    /* ebx is the PIC register; save/restore it around the asm block. */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            /* second pass: V plane from src2 into dst+VOF */
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* Fill the tail where the source index would run past srcW-1. */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            //printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        x86_reg dstWidth_reg = dstWidth;
        /* One U and one V output sample per iteration; the fractional
         * position lives in %cx and the integer index advances via
         * add-with-carry. */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
            ".p2align    4                          \n\t"
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
            :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
}
2347

    
2348
/* Horizontally scale one pair of chroma lines into the temp buffer:
 * optional input conversion to planar U/V via formatConvBuffer, then
 * scaling (generic filtered hScale or the fast bilinear path), then an
 * optional chroma range conversion. U goes to dst, V to dst+VOFW. */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{
    const uint8_t *uSrc = src1 + c->chrSrcOffset;
    const uint8_t *vSrc = src2 + c->chrSrcOffset;

    if (c->chrToYV12) {
        /* Convert the input into planar U/V inside formatConvBuffer. */
        c->chrToYV12(formatConvBuffer, formatConvBuffer + VOFW, uSrc, vSrc, srcW, pal);
        uSrc = formatConvBuffer;
        vSrc = formatConvBuffer + VOFW;
    }

    if (c->hcscale_fast) { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst, dstWidth, uSrc, vSrc, srcW, xInc);
    } else {
        c->hScale(dst       , dstWidth, uSrc, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst + VOFW, dstWidth, vSrc, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
}
2374

    
2375
/* Compile-time switch for the slice/ring-buffer tracing below; set to 1 to
 * enable the DEBUG_BUFFERS log output. */
#define DEBUG_SWSCALE_BUFFERS 0
/* NOTE: expands to an unbraced 'if' and references a local 'c' (the
 * SwsContext*) at the use site; only use it as a full statement. */
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2377

    
2378
static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2379
                           int srcSliceH, uint8_t* dst[], int dstStride[])
2380
{
2381
    /* load a few things into local vars to make the code more readable? and faster */
2382
    const int srcW= c->srcW;
2383
    const int dstW= c->dstW;
2384
    const int dstH= c->dstH;
2385
    const int chrDstW= c->chrDstW;
2386
    const int chrSrcW= c->chrSrcW;
2387
    const int lumXInc= c->lumXInc;
2388
    const int chrXInc= c->chrXInc;
2389
    const enum PixelFormat dstFormat= c->dstFormat;
2390
    const int flags= c->flags;
2391
    int16_t *vLumFilterPos= c->vLumFilterPos;
2392
    int16_t *vChrFilterPos= c->vChrFilterPos;
2393
    int16_t *hLumFilterPos= c->hLumFilterPos;
2394
    int16_t *hChrFilterPos= c->hChrFilterPos;
2395
    int16_t *vLumFilter= c->vLumFilter;
2396
    int16_t *vChrFilter= c->vChrFilter;
2397
    int16_t *hLumFilter= c->hLumFilter;
2398
    int16_t *hChrFilter= c->hChrFilter;
2399
    int32_t *lumMmxFilter= c->lumMmxFilter;
2400
    int32_t *chrMmxFilter= c->chrMmxFilter;
2401
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2402
    const int vLumFilterSize= c->vLumFilterSize;
2403
    const int vChrFilterSize= c->vChrFilterSize;
2404
    const int hLumFilterSize= c->hLumFilterSize;
2405
    const int hChrFilterSize= c->hChrFilterSize;
2406
    int16_t **lumPixBuf= c->lumPixBuf;
2407
    int16_t **chrPixBuf= c->chrPixBuf;
2408
    int16_t **alpPixBuf= c->alpPixBuf;
2409
    const int vLumBufSize= c->vLumBufSize;
2410
    const int vChrBufSize= c->vChrBufSize;
2411
    uint8_t *formatConvBuffer= c->formatConvBuffer;
2412
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2413
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2414
    int lastDstY;
2415
    uint32_t *pal=c->pal_yuv;
2416

    
2417
    /* vars which will change and which we need to store back in the context */
2418
    int dstY= c->dstY;
2419
    int lumBufIndex= c->lumBufIndex;
2420
    int chrBufIndex= c->chrBufIndex;
2421
    int lastInLumBuf= c->lastInLumBuf;
2422
    int lastInChrBuf= c->lastInChrBuf;
2423

    
2424
    if (isPacked(c->srcFormat)) {
2425
        src[0]=
2426
        src[1]=
2427
        src[2]=
2428
        src[3]= src[0];
2429
        srcStride[0]=
2430
        srcStride[1]=
2431
        srcStride[2]=
2432
        srcStride[3]= srcStride[0];
2433
    }
2434
    srcStride[1]<<= c->vChrDrop;
2435
    srcStride[2]<<= c->vChrDrop;
2436

    
2437
    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2438
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2439
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2440
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2441
                   srcSliceY,    srcSliceH,    dstY,    dstH);
2442
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2443
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2444

    
2445
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2446
        static int warnedAlready=0; //FIXME move this into the context perhaps
2447
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
2448
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2449
                   "         ->cannot do aligned memory accesses anymore\n");
2450
            warnedAlready=1;
2451
        }
2452
    }
2453

    
2454
    /* Note the user might start scaling the picture in the middle so this
2455
       will not get executed. This is not really intended but works
2456
       currently, so people might do it. */
2457
    if (srcSliceY ==0) {
2458
        lumBufIndex=-1;
2459
        chrBufIndex=-1;
2460
        dstY=0;
2461
        lastInLumBuf= -1;
2462
        lastInChrBuf= -1;
2463
    }
2464

    
2465
    lastDstY= dstY;
2466

    
2467
    for (;dstY < dstH; dstY++) {
2468
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
2469
        const int chrDstY= dstY>>c->chrDstVSubSample;
2470
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2471
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2472
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2473

    
2474
        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2475
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2476
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2477
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2478
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2479
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2480
        int enough_lines;
2481

    
2482
        //handle holes (FAST_BILINEAR & weird filters)
2483
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2484
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2485
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2486
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2487

    
2488
        DEBUG_BUFFERS("dstY: %d\n", dstY);
2489
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2490
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2491
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2492
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2493

    
2494
        // Do we have enough lines in this slice to output the dstY line
2495
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2496

    
2497
        if (!enough_lines) {
2498
            lastLumSrcY = srcSliceY + srcSliceH - 1;
2499
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2500
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2501
                                            lastLumSrcY, lastChrSrcY);
2502
        }
2503

    
2504
        //Do horizontal scaling
2505
        while(lastInLumBuf < lastLumSrcY) {
2506
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2507
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2508
            lumBufIndex++;
2509
            assert(lumBufIndex < 2*vLumBufSize);
2510
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2511
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
2512
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2513
                            hLumFilter, hLumFilterPos, hLumFilterSize,
2514
                            formatConvBuffer,
2515
                            pal, 0);
2516
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2517
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2518
                                hLumFilter, hLumFilterPos, hLumFilterSize,
2519
                                formatConvBuffer,
2520
                                pal, 1);
2521
            lastInLumBuf++;
2522
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2523
                               lumBufIndex,    lastInLumBuf);
2524
        }
2525
        while(lastInChrBuf < lastChrSrcY) {
2526
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2527
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2528
            chrBufIndex++;
2529
            assert(chrBufIndex < 2*vChrBufSize);
2530
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2531
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2532
            //FIXME replace parameters through context struct (some at least)
2533

    
2534
            if (c->needs_hcscale)
2535
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2536
                                hChrFilter, hChrFilterPos, hChrFilterSize,
2537
                                formatConvBuffer,
2538
                                pal);
2539
            lastInChrBuf++;
2540
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2541
                               chrBufIndex,    lastInChrBuf);
2542
        }
2543
        //wrap buf index around to stay inside the ring buffer
2544
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2545
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2546
        if (!enough_lines)
2547
            break; //we can't output a dstY line so let's try with the next slice
2548

    
2549
        c->blueDither= ff_dither8[dstY&1];
2550
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2551
            c->greenDither= ff_dither8[dstY&1];
2552
        else
2553
            c->greenDither= ff_dither4[dstY&1];
2554
        c->redDither= ff_dither8[(dstY+1)&1];
2555
        if (dstY < dstH-2) {
2556
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2557
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2558
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2559
            int i;
2560
            if (flags & SWS_ACCURATE_RND) {
2561
                int s= APCK_SIZE / 8;
2562
                for (i=0; i<vLumFilterSize; i+=2) {
2563
                    *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
2564
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
2565
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
2566
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
2567
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2568
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2569
                        *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
2570
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
2571
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
2572
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
2573
                    }
2574
                }
2575
                for (i=0; i<vChrFilterSize; i+=2) {
2576
                    *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
2577
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
2578
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
2579
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
2580
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2581
                }
2582
            } else {
2583
                for (i=0; i<vLumFilterSize; i++) {
2584
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2585
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2586
                    lumMmxFilter[4*i+2]=
2587
                    lumMmxFilter[4*i+3]=
2588
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2589
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2590
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2591
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2592
                        alpMmxFilter[4*i+2]=
2593
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2594
                    }
2595
                }
2596
                for (i=0; i<vChrFilterSize; i++) {
2597
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2598
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2599
                    chrMmxFilter[4*i+2]=
2600
                    chrMmxFilter[4*i+3]=
2601
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2602
                }
2603
            }
2604
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2605
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2606
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2607
                c->yuv2nv12X(c,
2608
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2609
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2610
                             dest, uDest, dstW, chrDstW, dstFormat);
2611
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2612
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2613
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2614
                if (is16BPS(dstFormat)) {
2615
                    yuv2yuvX16inC(
2616
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2617
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2618
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2619
                                  dstFormat);
2620
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2621
                    const int16_t *lumBuf = lumSrcPtr[0];
2622
                    const int16_t *chrBuf= chrSrcPtr[0];
2623
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2624
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2625
                } else { //General YV12
2626
                    c->yuv2yuvX(c,
2627
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2628
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2629
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2630
                }
2631
            } else {
2632
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2633
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2634
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2635
                    int chrAlpha= vChrFilter[2*dstY+1];
2636
                    if(flags & SWS_FULL_CHR_H_INT) {
2637
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2638
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2639
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2640
                                         alpSrcPtr, dest, dstW, dstY);
2641
                    } else {
2642
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2643
                                       alpPixBuf ? *alpSrcPtr : NULL,
2644
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
2645
                    }
2646
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2647
                    int lumAlpha= vLumFilter[2*dstY+1];
2648
                    int chrAlpha= vChrFilter[2*dstY+1];
2649
                    lumMmxFilter[2]=
2650
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
2651
                    chrMmxFilter[2]=
2652
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2653
                    if(flags & SWS_FULL_CHR_H_INT) {
2654
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2655
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2656
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2657
                                         alpSrcPtr, dest, dstW, dstY);
2658
                    } else {
2659
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2660
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2661
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
2662
                    }
2663
                } else { //general RGB
2664
                    if(flags & SWS_FULL_CHR_H_INT) {
2665
                        yuv2rgbXinC_full(c,
2666
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2667
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2668
                                         alpSrcPtr, dest, dstW, dstY);
2669
                    } else {
2670
                        c->yuv2packedX(c,
2671
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2672
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2673
                                       alpSrcPtr, dest, dstW, dstY);
2674
                    }
2675
                }
2676
            }
2677
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
2678
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2679
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2680
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2681
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2682
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2683
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2684
                yuv2nv12XinC(
2685
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2686
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2687
                             dest, uDest, dstW, chrDstW, dstFormat);
2688
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2689
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2690
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2691
                if (is16BPS(dstFormat)) {
2692
                    yuv2yuvX16inC(
2693
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2694
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2695
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2696
                                  dstFormat);
2697
                } else {
2698
                    yuv2yuvXinC(
2699
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2700
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2701
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2702
                }
2703
            } else {
2704
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2705
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2706
                if(flags & SWS_FULL_CHR_H_INT) {
2707
                    yuv2rgbXinC_full(c,
2708
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2709
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2710
                                     alpSrcPtr, dest, dstW, dstY);
2711
                } else {
2712
                    yuv2packedXinC(c,
2713
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2714
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2715
                                   alpSrcPtr, dest, dstW, dstY);
2716
                }
2717
            }
2718
        }
2719
    }
2720

    
2721
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2722
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2723

    
2724
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
2725
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2726
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
2727
    else                             __asm__ volatile("emms"  :::"memory");
2728
    /* store changed local vars back in the context */
2729
    c->dstY= dstY;
2730
    c->lumBufIndex= lumBufIndex;
2731
    c->chrBufIndex= chrBufIndex;
2732
    c->lastInLumBuf= lastInLumBuf;
2733
    c->lastInChrBuf= lastInChrBuf;
2734

    
2735
    return dstY - lastDstY;
2736
}
2737

    
2738
/*
 * Fill the SwsContext function-pointer table with the CPU-specific
 * implementations produced by this template instantiation (RENAME()
 * selects the MMX/MMX2/3DNow variant being compiled).
 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    const enum PixelFormat src_fmt = c->srcFormat;

    /* Vertical scaling / output stages. */
    c->yuv2nv12X   = RENAME(yuv2nv12X);
    c->yuv2yuv1    = RENAME(yuv2yuv1);
    c->yuv2yuvX    = RENAME(yuv2yuvX);
    c->yuv2packed1 = RENAME(yuv2packed1);
    c->yuv2packed2 = RENAME(yuv2packed2);
    c->yuv2packedX = RENAME(yuv2packedX);

    /* Generic horizontal scaler. */
    c->hScale = RENAME(hScale);

    /* Fast-bilinear horizontal scalers are installed only when the MMX2
     * code generator can be used; otherwise the generic MMX scaler above
     * is used (it is faster than the old x86 ASM one). */
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed) {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    } else {
        c->hyscale_fast = NULL;
        c->hcscale_fast = NULL;
    }

    /* Chroma input converter, keyed on the source pixel format. */
    switch (src_fmt) {
    case PIX_FMT_YUYV422:     c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422:     c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12:        c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21:        c->chrToYV12 = RENAME(nv21ToUV); break;
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV);   break;
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV);   break;
    default:                                                   break;
    }
    /* Packed RGB sources: the "_half" variants are selected when the
     * chroma planes are horizontally subsampled. */
    if (src_fmt == PIX_FMT_BGR24)
        c->chrToYV12 = c->chrSrcHSubSample ? RENAME(bgr24ToUV_half)
                                           : RENAME(bgr24ToUV);
    else if (src_fmt == PIX_FMT_RGB24)
        c->chrToYV12 = c->chrSrcHSubSample ? RENAME(rgb24ToUV_half)
                                           : RENAME(rgb24ToUV);

    /* Luma input converter. yuy2ToY / uyvyToY double as big-endian and
     * little-endian 16-bit gray/planar extractors respectively. */
    switch (src_fmt) {
    case PIX_FMT_YUYV422:
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_Y400A:
    case PIX_FMT_GRAY16BE:    c->lumToYV12 = RENAME(yuy2ToY);  break;
    case PIX_FMT_UYVY422:
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE:    c->lumToYV12 = RENAME(uyvyToY);  break;
    case PIX_FMT_BGR24:       c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_RGB24:       c->lumToYV12 = RENAME(rgb24ToY); break;
    default:                                                   break;
    }
    /* Alpha extraction: only PIX_FMT_Y400A gets a converter here. */
    if (c->alpPixBuf && src_fmt == PIX_FMT_Y400A)
        c->alpToYV12 = RENAME(yuy2ToY);

    /* Limited <-> full range conversion; skipped for RGB destinations. */
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }
}