/* ffmpeg / libswscale / x86 / swscale_template.c */

1
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20

    
21
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH

/* Prefetch instruction for the compile target:
 * 3DNow! has "prefetch", MMX2 has "prefetchnta", plain MMX gets a nop. */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH  " # nop"
#endif

/* Packed unsigned byte average: pavgb (MMX2) or pavgusb (3DNow!). */
#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* 64-bit store: non-temporal movntq where available, plain movq otherwise.
 * MOVNTQ is a second expansion level so macro arguments get expanded. */
#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
46

    
47
/* Vertical scaling to planar 8-bit output: walks the zero-terminated
 * (coefficient, source-line) filter list at "offset" inside the context (%0),
 * multiply-accumulates 8 pixels per inner iteration, then rounds, shifts to
 * 8 bits and stores with MOVNTQ into dest (%1) until width (%2) is reached. */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t" /* FIXME Unroll? */\
        "1:                                                 \n\t"\
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
        "add                                $16, %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "pmulhw                           %%mm0, %%mm2      \n\t"\
        "pmulhw                           %%mm0, %%mm5      \n\t"\
        "paddw                            %%mm2, %%mm3      \n\t"\
        "paddw                            %%mm5, %%mm4      \n\t"\
        " jnz                                1b             \n\t"\
        "psraw                               $3, %%mm3      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "packuswb                         %%mm4, %%mm3      \n\t"\
        MOVNTQ(%%mm3, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" ((x86_reg)width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
82

    
83
/* Higher-precision variant of YSCALEYUV2YV12X: interleaves two source lines
 * with punpcklwd/punpckhwd and uses pmaddwd to accumulate in 32 bits
 * (APCK_* describe the packed coeff/pointer filter layout), then shifts,
 * packs and rounds back to 8-bit output. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t"\
        "1:                                                 \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
        "movq                             %%mm0, %%mm3      \n\t"\
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
        "paddd                            %%mm0, %%mm4      \n\t"\
        "paddd                            %%mm3, %%mm5      \n\t"\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "movq                             %%mm2, %%mm0      \n\t"\
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "paddd                            %%mm2, %%mm6      \n\t"\
        "paddd                            %%mm0, %%mm7      \n\t"\
        " jnz                                1b             \n\t"\
        "psrad                              $16, %%mm4      \n\t"\
        "psrad                              $16, %%mm5      \n\t"\
        "psrad                              $16, %%mm6      \n\t"\
        "psrad                              $16, %%mm7      \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
        "packssdw                         %%mm5, %%mm4      \n\t"\
        "packssdw                         %%mm7, %%mm6      \n\t"\
        "paddw                            %%mm0, %%mm4      \n\t"\
        "paddw                            %%mm0, %%mm6      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "psraw                               $3, %%mm6      \n\t"\
        "packuswb                         %%mm6, %%mm4      \n\t"\
        MOVNTQ(%%mm4, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" ((x86_reg)width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
144

    
145
/* Unscaled vertical pass (single source line): shift 16-bit intermediates
 * down by 7, pack to unsigned bytes and store 8 pixels per iteration.
 * %0 = src (biased by width), %1 = dest, %2 = -width counter start. */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ".p2align               4             \n\t" /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
157

    
158
/* Like YSCALEYUV2YV121 but adds a rounding constant (64, built in mm7 via
 * pcmpeqw/psrlw/psllw) with saturation before the >>7, for correct rounding. */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ".p2align                4            \n\t" /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
175

    
176
/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW_reg),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/* Opens the packed-output scaler: outer loop label "1:" over output pixels,
 * inner loop "2:" multiply-accumulates the chroma filter taps, leaving
 * U sums in mm3 and V sums in mm4 (V data lives VOF bytes past U). */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
        "movq                      %%mm3, %%mm4         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
        "add                         $16, %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pmulhw                    %%mm0, %%mm2         \n\t"\
        "pmulhw                    %%mm0, %%mm5         \n\t"\
        "paddw                     %%mm2, %%mm3         \n\t"\
        "paddw                     %%mm5, %%mm4         \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        " jnz                         2b                \n\t"

    
207
/* Luma (or alpha) half of the packed-output scaler: multiply-accumulates
 * the filter taps at "offset" for two groups of 4 pixels into dst1/dst2,
 * using coeff/src1/src2 as scratch registers. */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"
225

    
226
/* Full chroma + luma vertical filtering for packed output
 * (U->mm3, V->mm4, Y1->mm1, Y2->mm7). */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7)
229

    
230
/* Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE)_UV:
 * operand list and clobbers for the whole packed-output loop. */
#define YSCALEYUV2PACKEDX_END                     \
        :: "r" (&c->redDither),                   \
            "m" (dummy), "m" (dummy), "m" (dummy),\
            "r" (dest), "m" (dstW_reg)            \
        : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
236

    
237
/* High-precision chroma pass for packed output: pmaddwd accumulation in
 * 32 bits over pairs of source lines (APCK_* filter layout), rounded and
 * parked in the context's U_TEMP/V_TEMP slots for the luma pass to pick up. */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pxor                      %%mm4, %%mm4         \n\t"\
        "pxor                      %%mm5, %%mm5         \n\t"\
        "pxor                      %%mm6, %%mm6         \n\t"\
        "pxor                      %%mm7, %%mm7         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
        "movq                      %%mm0, %%mm3         \n\t"\
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
        "paddd                     %%mm0, %%mm4         \n\t"\
        "paddd                     %%mm3, %%mm5         \n\t"\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        "movq                      %%mm2, %%mm0         \n\t"\
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "paddd                     %%mm2, %%mm6         \n\t"\
        "paddd                     %%mm0, %%mm7         \n\t"\
        " jnz                         2b                \n\t"\
        "psrad                       $16, %%mm4         \n\t"\
        "psrad                       $16, %%mm5         \n\t"\
        "psrad                       $16, %%mm6         \n\t"\
        "psrad                       $16, %%mm7         \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
        "packssdw                  %%mm5, %%mm4         \n\t"\
        "packssdw                  %%mm7, %%mm6         \n\t"\
        "paddw                     %%mm0, %%mm4         \n\t"\
        "paddw                     %%mm0, %%mm6         \n\t"\
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"
286

    
287
/* High-precision luma (or alpha) pass: 32-bit pmaddwd accumulation of the
 * filter taps at "offset"; leaves Y1 in mm1, Y2 in mm7, and reloads the
 * chroma results stashed in U_TEMP/V_TEMP into mm3/mm4. */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"
331

    
332
/* High-precision chroma + luma vertical filtering for packed output. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
335

    
336
/* YUV -> RGB conversion core for the X (multi-tap) path: takes Y1/Y2 in
 * mm1/mm7 and U/V in mm3/mm4, applies the per-context coefficients from
 * (%0), and leaves packed B in mm2/mm0, R in mm5/mm6, G in mm4/mm3. */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"
371

    
372
/* Two-tap vertical interpolation for packed YUV output: blends uvbuf0/uvbuf1
 * (chroma) and buf0/buf1 (luma) with the alpha coefficients stored in the
 * context's filter slots (pre-shifted by 3 here for headroom). */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
409

    
410
/* Chroma part of the two-tap YUV->RGB loop: interpolate uvbuf0/uvbuf1,
 * bias by 128 and premultiply the green coefficients
 * (results: mm2=(U-128)8, mm3=ug, mm4=vg, mm5=(V-128)8). */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */

    
435
/* Luma part of the two-tap YUV->RGB loop: interpolate b1/b2 (buf0/buf1)
 * with the luma alpha coefficient; leaves Y1 in mm1, Y2 in mm7. */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/
448

    
449
/* Final coefficient stage of the two-tap YUV->RGB loop: scale Y, combine
 * with the chroma products and pack to bytes
 * (B in mm2/mm0, R in mm5/mm6, G in mm4/mm3). */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"
477

    
478
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/* Complete two-tap YUV->RGB pipeline: chroma interp, luma interp (buf0=%0,
 * buf1=%1), then coefficient/pack stage. */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
484

    
485
/* Single-source-line (no vertical blend) variant for packed YUV output:
 * just shift the 16-bit intermediates of uvbuf0/buf0 down to output range. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
499

    
500
#define REAL_YSCALEYUV2RGB1(index, c) \
501
    "xor            "#index", "#index"  \n\t"\
502
    ".p2align              4            \n\t"\
503
    "1:                                 \n\t"\
504
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
505
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
506
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
507
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
508
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
509
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
510
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
511
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
512
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
513
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
514
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
515
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
516
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
517
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
518
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
519
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
520
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
521
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
522
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
523
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
524
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
525
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
526
    "paddw             %%mm3, %%mm4     \n\t"\
527
    "movq              %%mm2, %%mm0     \n\t"\
528
    "movq              %%mm5, %%mm6     \n\t"\
529
    "movq              %%mm4, %%mm3     \n\t"\
530
    "punpcklwd         %%mm2, %%mm2     \n\t"\
531
    "punpcklwd         %%mm5, %%mm5     \n\t"\
532
    "punpcklwd         %%mm4, %%mm4     \n\t"\
533
    "paddw             %%mm1, %%mm2     \n\t"\
534
    "paddw             %%mm1, %%mm5     \n\t"\
535
    "paddw             %%mm1, %%mm4     \n\t"\
536
    "punpckhwd         %%mm0, %%mm0     \n\t"\
537
    "punpckhwd         %%mm6, %%mm6     \n\t"\
538
    "punpckhwd         %%mm3, %%mm3     \n\t"\
539
    "paddw             %%mm7, %%mm0     \n\t"\
540
    "paddw             %%mm7, %%mm6     \n\t"\
541
    "paddw             %%mm7, %%mm3     \n\t"\
542
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
543
    "packuswb          %%mm0, %%mm2     \n\t"\
544
    "packuswb          %%mm6, %%mm5     \n\t"\
545
    "packuswb          %%mm3, %%mm4     \n\t"\
546

    
547
/* Public wrapper so that index/c are macro-expanded before stringification. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
/* Load one luma line and the average of two chroma lines into MMX registers
 * (mm1/mm7 = luma, mm3/mm4 = U/V) without any YUV->RGB conversion; used for
 * packed-YUV output. No colorspace math is done here. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
// do vertical chrominance interpolation
/* Convert one luma line plus the average of two chroma lines to RGB:
 * mm2/mm0 = B, mm4/mm3 = G, mm5/mm6 = R, packed to unsigned bytes at the end. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
/* Load 8 alpha samples from abuf0, shift to 8-bit range and pack them
 * into mm7 (bytes); companion to the YSCALEYUV2RGB1 luma/chroma macros. */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
/* Interleave the byte-packed B/G/R/A registers into 8 ARGB pixels and store
 * them at dst+4*index; advances index by 8 and loops back to label 1. */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
/* Pack the byte B/G/R registers (mm2/mm4/mm5) into 8 RGB565 pixels and store
 * them at dst+2*index; advances index by 8 and loops back to label 1.
 * Expects mm7 == 0. */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
/* Pack the byte B/G/R registers (mm2/mm4/mm5) into 8 RGB555 pixels and store
 * them at dst+2*index; advances index by 8 and loops back to label 1.
 * Expects mm7 == 0. */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
/* Historical plain-MMX 24-bit packer kept for reference; superseded by
 * WRITEBGR24MMX/WRITEBGR24MMX2 below. Packs 8 pixels from mm2/mm4/mm5 into
 * 24 bytes at dst, then advances dst by 24 and index by 8. Expects mm7 == 0. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
/* Plain-MMX 24-bit packer: packs 8 pixels from mm2/mm4/mm5 into 24 bytes at
 * dst via shift/merge, advances dst by 24 and index by 8, loops to label 1.
 * Expects mm7 == 0 on entry (it is clobbered). */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
/* MMX2 24-bit packer: uses pshufw plus the ff_M24A/B/C byte-select masks to
 * interleave 8 pixels from mm2/mm4/mm5 into 24 bytes at dst; advances dst by
 * 24 and index by 8, loops to label 1. Clobbers mm0-mm7. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
/* Select the 24-bit writer matching the instruction set this template
 * instantiation is compiled for. */
#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
/* Interleave luma (mm1/mm7) with packed U (mm3) and V (mm4) into YUYV order
 * and store 16 bytes at dst+2*index; advances index by 8, loops to label 1. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
/**
 * Vertical filtering of planar YV12 output (luma, chroma, optional alpha)
 * using the MMX filter macros; uses the accurate-rounding variants when
 * SWS_ACCURATE_RND is set, and falls back to the C implementation when
 * SWS_BITEXACT is requested.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
}
/**
 * NV12/NV21 vertical filtering: no SIMD version exists in this template,
 * so this always defers to the generic C implementation.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
/**
 * 1:1 vertical "scaling": round each 14-bit intermediate sample down to 8 bits
 * and clip. The MMX path iterates backwards over the four planes
 * (alpha, luma, U, V); the C fallback (SWS_BITEXACT) clips manually.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        /* bit 8 set means the value left [0,255]; clip to the nearer bound */
        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
/**
1006
 * vertical scale YV12 to RGB
1007
 */
1008
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1009
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1010
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1011
{
1012
    x86_reg dummy=0;
1013
    x86_reg dstW_reg = dstW;
1014
    if(!(c->flags & SWS_BITEXACT)) {
1015
        if (c->flags & SWS_ACCURATE_RND) {
1016
            switch(c->dstFormat) {
1017
            case PIX_FMT_RGB32:
1018
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1019
                    YSCALEYUV2PACKEDX_ACCURATE
1020
                    YSCALEYUV2RGBX
1021
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1022
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1023
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1024
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1025
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1026
                    "psraw                        $3, %%mm1         \n\t"
1027
                    "psraw                        $3, %%mm7         \n\t"
1028
                    "packuswb                  %%mm7, %%mm1         \n\t"
1029
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1030

    
1031
                    YSCALEYUV2PACKEDX_END
1032
                } else {
1033
                    YSCALEYUV2PACKEDX_ACCURATE
1034
                    YSCALEYUV2RGBX
1035
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1036
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1037

    
1038
                    YSCALEYUV2PACKEDX_END
1039
                }
1040
                return;
1041
            case PIX_FMT_BGR24:
1042
                YSCALEYUV2PACKEDX_ACCURATE
1043
                YSCALEYUV2RGBX
1044
                "pxor %%mm7, %%mm7 \n\t"
1045
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1046
                "add %4, %%"REG_c"                        \n\t"
1047
                WRITEBGR24(%%REGc, %5, %%REGa)
1048

    
1049

    
1050
                :: "r" (&c->redDither),
1051
                "m" (dummy), "m" (dummy), "m" (dummy),
1052
                "r" (dest), "m" (dstW_reg)
1053
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1054
                );
1055
                return;
1056
            case PIX_FMT_RGB555:
1057
                YSCALEYUV2PACKEDX_ACCURATE
1058
                YSCALEYUV2RGBX
1059
                "pxor %%mm7, %%mm7 \n\t"
1060
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1061
#ifdef DITHER1XBPP
1062
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1063
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1064
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1065
#endif
1066

    
1067
                WRITERGB15(%4, %5, %%REGa)
1068
                YSCALEYUV2PACKEDX_END
1069
                return;
1070
            case PIX_FMT_RGB565:
1071
                YSCALEYUV2PACKEDX_ACCURATE
1072
                YSCALEYUV2RGBX
1073
                "pxor %%mm7, %%mm7 \n\t"
1074
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1075
#ifdef DITHER1XBPP
1076
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1077
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1078
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1079
#endif
1080

    
1081
                WRITERGB16(%4, %5, %%REGa)
1082
                YSCALEYUV2PACKEDX_END
1083
                return;
1084
            case PIX_FMT_YUYV422:
1085
                YSCALEYUV2PACKEDX_ACCURATE
1086
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1087

    
1088
                "psraw $3, %%mm3    \n\t"
1089
                "psraw $3, %%mm4    \n\t"
1090
                "psraw $3, %%mm1    \n\t"
1091
                "psraw $3, %%mm7    \n\t"
1092
                WRITEYUY2(%4, %5, %%REGa)
1093
                YSCALEYUV2PACKEDX_END
1094
                return;
1095
            }
1096
        } else {
1097
            switch(c->dstFormat) {
1098
            case PIX_FMT_RGB32:
1099
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1100
                    YSCALEYUV2PACKEDX
1101
                    YSCALEYUV2RGBX
1102
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1103
                    "psraw                        $3, %%mm1         \n\t"
1104
                    "psraw                        $3, %%mm7         \n\t"
1105
                    "packuswb                  %%mm7, %%mm1         \n\t"
1106
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1107
                    YSCALEYUV2PACKEDX_END
1108
                } else {
1109
                    YSCALEYUV2PACKEDX
1110
                    YSCALEYUV2RGBX
1111
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1112
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1113
                    YSCALEYUV2PACKEDX_END
1114
                }
1115
                return;
1116
            case PIX_FMT_BGR24:
1117
                YSCALEYUV2PACKEDX
1118
                YSCALEYUV2RGBX
1119
                "pxor                    %%mm7, %%mm7       \n\t"
1120
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1121
                "add                        %4, %%"REG_c"   \n\t"
1122
                WRITEBGR24(%%REGc, %5, %%REGa)
1123

    
1124
                :: "r" (&c->redDither),
1125
                "m" (dummy), "m" (dummy), "m" (dummy),
1126
                "r" (dest),  "m" (dstW_reg)
1127
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1128
                );
1129
                return;
1130
            case PIX_FMT_RGB555:
1131
                YSCALEYUV2PACKEDX
1132
                YSCALEYUV2RGBX
1133
                "pxor %%mm7, %%mm7 \n\t"
1134
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1135
#ifdef DITHER1XBPP
1136
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1137
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1138
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1139
#endif
1140

    
1141
                WRITERGB15(%4, %5, %%REGa)
1142
                YSCALEYUV2PACKEDX_END
1143
                return;
1144
            case PIX_FMT_RGB565:
1145
                YSCALEYUV2PACKEDX
1146
                YSCALEYUV2RGBX
1147
                "pxor %%mm7, %%mm7 \n\t"
1148
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1149
#ifdef DITHER1XBPP
1150
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1151
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1152
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1153
#endif
1154

    
1155
                WRITERGB16(%4, %5, %%REGa)
1156
                YSCALEYUV2PACKEDX_END
1157
                return;
1158
            case PIX_FMT_YUYV422:
1159
                YSCALEYUV2PACKEDX
1160
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1161

    
1162
                "psraw $3, %%mm3    \n\t"
1163
                "psraw $3, %%mm4    \n\t"
1164
                "psraw $3, %%mm1    \n\t"
1165
                "psraw $3, %%mm7    \n\t"
1166
                WRITEYUY2(%4, %5, %%REGa)
1167
                YSCALEYUV2PACKEDX_END
1168
                return;
1169
            }
1170
        }
1171
    }
1172
    yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1173
                   chrFilter, chrSrc, chrFilterSize,
1174
                   alpSrc, dest, dstW, dstY);
1175
}
1176

    
1177
/**
1178
 * vertical bilinear scale YV12 to RGB
1179
 */
1180
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1181
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1182
{
1183
    int  yalpha1=4095- yalpha;
1184
    int uvalpha1=4095-uvalpha;
1185
    int i;
1186

    
1187
    if(!(c->flags & SWS_BITEXACT)) {
1188
        switch(c->dstFormat) {
1189
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1190
        case PIX_FMT_RGB32:
1191
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1192
#if ARCH_X86_64
1193
                __asm__ volatile(
1194
                    YSCALEYUV2RGB(%%r8, %5)
1195
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1196
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1197
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1198
                    "packuswb            %%mm7, %%mm1       \n\t"
1199
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1200

    
1201
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1202
                    "a" (&c->redDither)
1203
                    ,"r" (abuf0), "r" (abuf1)
1204
                    : "%r8"
1205
                );
1206
#else
1207
                *(const uint16_t **)(&c->u_temp)=abuf0;
1208
                *(const uint16_t **)(&c->v_temp)=abuf1;
1209
                __asm__ volatile(
1210
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1211
                    "mov        %4, %%"REG_b"               \n\t"
1212
                    "push %%"REG_BP"                        \n\t"
1213
                    YSCALEYUV2RGB(%%REGBP, %5)
1214
                    "push                   %0              \n\t"
1215
                    "push                   %1              \n\t"
1216
                    "mov          "U_TEMP"(%5), %0          \n\t"
1217
                    "mov          "V_TEMP"(%5), %1          \n\t"
1218
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1219
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1220
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1221
                    "packuswb            %%mm7, %%mm1       \n\t"
1222
                    "pop                    %1              \n\t"
1223
                    "pop                    %0              \n\t"
1224
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1225
                    "pop %%"REG_BP"                         \n\t"
1226
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1227

    
1228
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1229
                    "a" (&c->redDither)
1230
                );
1231
#endif
1232
            } else {
1233
                __asm__ volatile(
1234
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1235
                    "mov        %4, %%"REG_b"               \n\t"
1236
                    "push %%"REG_BP"                        \n\t"
1237
                    YSCALEYUV2RGB(%%REGBP, %5)
1238
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1239
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1240
                    "pop %%"REG_BP"                         \n\t"
1241
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1242

    
1243
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1244
                    "a" (&c->redDither)
1245
                );
1246
            }
1247
            return;
1248
        case PIX_FMT_BGR24:
1249
            __asm__ volatile(
1250
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1251
                "mov        %4, %%"REG_b"               \n\t"
1252
                "push %%"REG_BP"                        \n\t"
1253
                YSCALEYUV2RGB(%%REGBP, %5)
1254
                "pxor    %%mm7, %%mm7                   \n\t"
1255
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1256
                "pop %%"REG_BP"                         \n\t"
1257
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1258
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1259
                "a" (&c->redDither)
1260
            );
1261
            return;
1262
        case PIX_FMT_RGB555:
1263
            __asm__ volatile(
1264
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1265
                "mov        %4, %%"REG_b"               \n\t"
1266
                "push %%"REG_BP"                        \n\t"
1267
                YSCALEYUV2RGB(%%REGBP, %5)
1268
                "pxor    %%mm7, %%mm7                   \n\t"
1269
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1270
#ifdef DITHER1XBPP
1271
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1272
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1273
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1274
#endif
1275

    
1276
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1277
                "pop %%"REG_BP"                         \n\t"
1278
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1279

    
1280
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1281
                "a" (&c->redDither)
1282
            );
1283
            return;
1284
        case PIX_FMT_RGB565:
1285
            __asm__ volatile(
1286
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1287
                "mov        %4, %%"REG_b"               \n\t"
1288
                "push %%"REG_BP"                        \n\t"
1289
                YSCALEYUV2RGB(%%REGBP, %5)
1290
                "pxor    %%mm7, %%mm7                   \n\t"
1291
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1292
#ifdef DITHER1XBPP
1293
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1294
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1295
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1296
#endif
1297

    
1298
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1299
                "pop %%"REG_BP"                         \n\t"
1300
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1301
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1302
                "a" (&c->redDither)
1303
            );
1304
            return;
1305
        case PIX_FMT_YUYV422:
1306
            __asm__ volatile(
1307
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1308
                "mov %4, %%"REG_b"                        \n\t"
1309
                "push %%"REG_BP"                        \n\t"
1310
                YSCALEYUV2PACKED(%%REGBP, %5)
1311
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1312
                "pop %%"REG_BP"                         \n\t"
1313
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1314
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1315
                "a" (&c->redDither)
1316
            );
1317
            return;
1318
        default: break;
1319
        }
1320
    }
1321
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1322
}
1323

    
1324
/**
 * YV12 to RGB without scaling or interpolating
 *
 * Converts a single horizontally-scaled line (buf0 = luma, uvbuf0/uvbuf1 =
 * chroma, abuf0 = optional alpha) straight to the packed output format.
 * uvalpha < 2048 selects the "nearest chroma line" fast path (see note
 * below); otherwise the two chroma lines are averaged (the *1b variants).
 * Falls back to the generic C macros when SWS_BITEXACT is requested or the
 * destination format has no MMX path.
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        // full-chroma interpolation is handled by the 2-line function
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    /* Each asm block below spills REG_b and REG_BP to the
                     * context (ESP_OFFSET) because the conversion macros need
                     * every general-purpose register; they are restored after
                     * the loop.  %5 ("a") is &c->redDither, used as base
                     * pointer into the SwsContext scratch area. */
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        /* alpha plane (abuf0) goes in "d" in place of buf1 */
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7                   \n\t" /* mm7 = all-ones -> opaque alpha */
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    /* per-channel ordered dither before truncating to 5/5/5 */
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        } else {
            /* uvalpha >= 2048: average the two chroma lines (*1b macros) */
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7                   \n\t" /* opaque alpha */
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        }
    }
    /* bit-exact mode or unhandled dstFormat: generic C fallback */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
1562

    
1563
//FIXME yuy2* can read up to 7 samples too much
1564

    
1565
/* Extract the luma bytes of one YUYV (YUY2) line into dst.
 * Processes 8 pixels (16 source bytes) per iteration; bm01010101 is a
 * per-word byte mask (defined elsewhere) that keeps the even bytes, i.e.
 * the Y samples.  The loop counter runs from -width up to 0. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "pand                %%mm2, %%mm0           \n\t" /* keep Y bytes */
        "pand                %%mm2, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t" /* 8 luma bytes */
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
1583

    
1584
/* Split the chroma of one YUYV (YUY2) line into separate U and V planes,
 * 8 pixels (4 U/V pairs) per iteration.  src2 must equal src1: packed
 * YUYV carries both chroma components in the same buffer. */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* keep odd bytes: the chroma samples */
        "psrlw                  $8, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t" /* mm0 = U V U V U V U V */
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* mm0 -> V samples */
        "pand                %%mm4, %%mm1           \n\t" /* mm1 -> U samples */
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t" /* 4 V bytes */
        "movd                %%mm1, (%2, %%"REG_a") \n\t" /* 4 U bytes */
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
1609

    
1610
/* Reduce two planes of 16-bit little-endian samples to 8 bits by taking the
 * high byte of each word (psrlw $8 on LE data), writing 8 samples of U and
 * V per iteration.  src1/src2 are already separate U and V source planes. */
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* keep MSB of each LE word */
        "psrlw                  $8, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}
1633

    
1634
/* This is almost identical to the previous, end exists only because
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
/* UYVY stores luma in the odd bytes, so shifting each word right by 8
 * (instead of masking, as yuy2ToY does) extracts Y. 8 pixels/iteration. */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    __asm__ volatile(
        "mov                  %0, %%"REG_a"         \n\t"
        "1:                                         \n\t"
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
        "psrlw                $8, %%mm0             \n\t" /* keep Y (odd) bytes */
        "psrlw                $8, %%mm1             \n\t"
        "packuswb          %%mm1, %%mm0             \n\t"
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
        "add                  $8, %%"REG_a"         \n\t"
        " js                  1b                    \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
}
1653

    
1654
/* Split the chroma of one UYVY line into separate U and V planes.  Chroma
 * sits in the even bytes, so the initial pand keeps it (mirror of
 * yuy2ToUV, which shifts instead).  src2 must equal src1. */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "pand                %%mm4, %%mm0           \n\t" /* keep even bytes: the chroma samples */
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t" /* mm0 = U V U V U V U V */
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* mm0 -> V samples */
        "pand                %%mm4, %%mm1           \n\t" /* mm1 -> U samples */
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t" /* 4 V bytes */
        "movd                %%mm1, (%2, %%"REG_a") \n\t" /* 4 U bytes */
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
    assert(src1 == src2);
}
1679

    
1680
/* Reduce two planes of 16-bit big-endian samples to 8 bits by keeping the
 * first byte of each word (the MSB for BE data) via the bm01010101 mask;
 * writes 8 samples of U and V per iteration. */
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t" /* keep MSB of each BE word */
        "pand                %%mm4, %%mm1           \n\t"
        "pand                %%mm4, %%mm2           \n\t"
        "pand                %%mm4, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
}
1704

    
1705
/* De-interleave a line of alternating bytes (NV12/NV21 chroma layout):
 * even bytes go to dst1, odd bytes to dst2, 8 output samples each per
 * iteration. */
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq                %%mm0, %%mm2           \n\t"
        "movq                %%mm1, %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t" /* even bytes */
        "pand                %%mm4, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t" /* odd bytes */
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
}
1730

    
1731
/* NV12 chroma: interleaved as U,V pairs, so the even bytes are U and the
 * odd bytes are V.  src2 and unused are ignored. */
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    /* even -> dstU, odd -> dstV */
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}
1737

    
1738
/* NV21 chroma: interleaved as V,U pairs, i.e. NV12 with the components
 * swapped — so the destination pointers are passed in reverse order. */
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    /* even -> dstV, odd -> dstU */
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
1744

    
1745
/* Convert a line of packed 24-bit BGR or RGB to 8-bit luma.
 * Loads the channel-order-specific multiplier pairs into mm5/mm6, then
 * processes 4 pixels (12 bytes) per iteration with pmaddwd, adding the
 * rounding/offset constant ff_bgr24toYOffset and shifting down by 15. */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t" /* zero for unpacking */
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        /* overlapping 4-byte loads pick up the 3-byte pixels */
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t" /* bytes -> 16-bit words */
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t" /* multiply-accumulate coefficients */
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t" /* + offset/rounding */
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t" /* 4 luma bytes */
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    : "%"REG_a
    );
}
1797

    
1798
/* Convert a line of packed 24-bit BGR or RGB to 8-bit U and V.
 * %4 points into the ff_bgr24toUV coefficient table (selected per source
 * channel order); 4 pixels per iteration, two pmaddwd chains compute U
 * (mm0/mm1) and V (mm2/mm4) in parallel, offset by ff_bgr24toUVOffset and
 * shifted down by 15. */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq                    24(%4), %%mm6       \n\t" /* last coefficient pair kept in a register */
        "mov                        %3, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        /* first two pixels */
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                  (%4), %%mm0       \n\t" /* U coefficients */
        "pmaddwd                 8(%4), %%mm1       \n\t"
        "pmaddwd                16(%4), %%mm2       \n\t" /* V coefficients */
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        /* second two pixels */
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                  (%4), %%mm1       \n\t"
        "pmaddwd                 8(%4), %%mm3       \n\t"
        "pmaddwd                16(%4), %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t" /* + offset/rounding */
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t" /* 4 U bytes */
        "movd                %%mm2, (%2, %%"REG_a") \n\t" /* 4 V bytes */
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
    : "%"REG_a
    );
}
1855

    
1856
/* Convert one line of packed BGR24 to 8-bit luma via the shared MMX helper;
 * the BGR/RGB distinction is handled by passing the source pixel format.
 * 'unused' keeps the signature uniform with other toYV12 converters. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
}
1860

    
1861
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1862
{
1863
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1864
    assert(src1 == src2);
1865
}
1866

    
1867
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1868
{
1869
    int i;
1870
    for (i=0; i<width; i++) {
1871
        int b= src1[6*i + 0] + src1[6*i + 3];
1872
        int g= src1[6*i + 1] + src1[6*i + 4];
1873
        int r= src1[6*i + 2] + src1[6*i + 5];
1874

    
1875
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1876
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1877
    }
1878
    assert(src1 == src2);
1879
}
1880

    
1881
/* Convert one line of packed RGB24 to 8-bit luma via the shared MMX helper
 * (same code path as bgr24ToY, distinguished only by the pixel format tag). */
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
}
1885

    
1886
/* Convert one line of packed RGB24 to planar U/V via the shared MMX helper.
 * src1 and src2 must alias (packed input); only src1 is forwarded. */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
}
1891

    
1892
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1893
{
1894
    int i;
1895
    assert(src1==src2);
1896
    for (i=0; i<width; i++) {
1897
        int r= src1[6*i + 0] + src1[6*i + 3];
1898
        int g= src1[6*i + 1] + src1[6*i + 4];
1899
        int b= src1[6*i + 2] + src1[6*i + 5];
1900

    
1901
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1902
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1903
    }
1904
}
1905

    
1906

    
1907
// bilinear / bicubic scaling
/* Horizontally scale one 8-bit input line into the 15-bit intermediate
 * buffer with an FIR filter:
 *   dst[i] = (sum_j src[filterPos[i]+j] * filter[i*filterSize+j]) >> 7
 * Dedicated MMX loops handle filterSize 4 and 8; any other (multiple-of-4)
 * size falls through to a generic inner loop over the taps. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        /* counter runs from -2*dstW up to 0; the pointers are pre-biased so
         * that indexing with the counter walks the arrays forward, and the
         * "jnc" (carry from the add) terminates the loop at 0. */
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push            %%"REG_b"              \n\t" // ebx is the PIC register: save/restore manually
#endif
            "pxor                %%mm7, %%mm7       \n\t" // mm7 = 0, used to zero-extend bytes
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
            ".p2align                4              \n\t"
            "1:                                     \n\t"
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t" // filterPos for two outputs
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t" // 4 filter taps each
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
            "movd      (%3, %%"REG_a"), %%mm0       \n\t" // 4 source bytes each
            "movd      (%3, %%"REG_b"), %%mm2       \n\t"
            "punpcklbw           %%mm7, %%mm0       \n\t"
            "punpcklbw           %%mm7, %%mm2       \n\t"
            "pmaddwd             %%mm1, %%mm0       \n\t" // multiply-accumulate taps
            "pmaddwd             %%mm2, %%mm3       \n\t"
            "movq                %%mm0, %%mm4       \n\t"
            "punpckldq           %%mm3, %%mm0       \n\t" // horizontal add of the two
            "punpckhdq           %%mm3, %%mm4       \n\t" // partial sums per output
            "paddd               %%mm4, %%mm0       \n\t"
            "psrad                  $7, %%mm0       \n\t" // >>7: 8bit src * 12bit coeff -> 15bit
            "packssdw            %%mm0, %%mm0       \n\t"
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t" // store 2 int16 results
            "add                    $4, %%"REG_BP"  \n\t"
            " jnc                   1b              \n\t"

            "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
            "pop             %%"REG_b"              \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        /* Same scheme as the 4-tap loop, but each output accumulates two
         * 4-tap pmaddwd halves before the horizontal add. */
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push             %%"REG_b"             \n\t"
#endif
            "pxor                 %%mm7, %%mm7      \n\t"
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
            "mov              %%"REG_a", %%"REG_BP" \n\t"
            ".p2align                 4             \n\t"
            "1:                                     \n\t"
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
            "movd       (%3, %%"REG_a"), %%mm0      \n\t"
            "movd       (%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm0      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm0      \n\t"
            "pmaddwd              %%mm2, %%mm3      \n\t"

            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t" // second half of the 8 taps
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
            "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
            "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm4      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm4      \n\t"
            "pmaddwd              %%mm2, %%mm5      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "paddd                %%mm5, %%mm3      \n\t"
            "movq                 %%mm0, %%mm4      \n\t"
            "punpckldq            %%mm3, %%mm0      \n\t"
            "punpckhdq            %%mm3, %%mm4      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "psrad                   $7, %%mm0      \n\t"
            "packssdw             %%mm0, %%mm0      \n\t"
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
            "add                     $4, %%"REG_BP" \n\t"
            " jnc                    1b             \n\t"

            "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
            "pop              %%"REG_b"             \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        /* Generic tap count: inner loop "2:" consumes 4 taps per iteration
         * for two outputs at once, bounded by the 'offset' end pointer. */
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor                  %%mm7, %%mm7     \n\t"
            ".p2align                  4            \n\t"
            "1:                                     \n\t"
            "mov                      %2, %%"REG_c" \n\t"
            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
            "mov                      %5, %%"REG_c" \n\t"
            "pxor                  %%mm4, %%mm4     \n\t" // accumulators for the
            "pxor                  %%mm5, %%mm5     \n\t" // two current outputs
            "2:                                     \n\t"
            "movq                   (%1), %%mm1     \n\t"
            "movq               (%1, %6), %%mm3     \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
            "punpcklbw             %%mm7, %%mm0     \n\t"
            "punpcklbw             %%mm7, %%mm2     \n\t"
            "pmaddwd               %%mm1, %%mm0     \n\t"
            "pmaddwd               %%mm2, %%mm3     \n\t"
            "paddd                 %%mm3, %%mm5     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "add                      $8, %1        \n\t" // next 4 taps
            "add                      $4, %%"REG_c" \n\t" // next 4 source bytes
            "cmp                      %4, %%"REG_c" \n\t"
            " jb                      2b            \n\t"
            "add                      %6, %1        \n\t" // skip the second output's taps
            "movq                  %%mm4, %%mm0     \n\t"
            "punpckldq             %%mm5, %%mm4     \n\t"
            "punpckhdq             %%mm5, %%mm0     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "psrad                    $7, %%mm4     \n\t"
            "packssdw              %%mm4, %%mm4     \n\t"
            "mov                      %3, %%"REG_a" \n\t"
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
            "add                      $4, %0        \n\t"
            " jnc                     1b            \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
            "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
}
2061

    
2062
//FIXME all pal and rgb srcFormats could do this convertion as well
2063
//FIXME all scalers more complex than bilinear could do half of this transform
2064
static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2065
{
2066
    int i;
2067
    for (i = 0; i < width; i++) {
2068
        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2069
        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2070
    }
2071
}
2072
static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2073
{
2074
    int i;
2075
    for (i = 0; i < width; i++) {
2076
        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
2077
        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2078
    }
2079
}
2080
static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2081
{
2082
    int i;
2083
    for (i = 0; i < width; i++)
2084
        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2085
}
2086
static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2087
{
2088
    int i;
2089
    for (i = 0; i < width; i++)
2090
        dst[i] = (dst[i]*14071 + 33561947)>>14;
2091
}
2092

    
2093
/* Inline-asm fragment: given src[xx] in %edi and src[xx+1] in %esi and the
 * 16.16 fractional position xalpha in %ecx, leaves the linearly interpolated
 * 7.9-format sample in %esi and reloads the destination pointer into REG_D.
 * Used by the non-MMX2 fast bilinear scalers below. */
#define FAST_BILINEAR_X86 \
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
    "shll      $16, %%edi    \n\t"                                              \
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
    "mov        %1, %%"REG_D"\n\t"                                              \
    "shrl       $9, %%esi    \n\t"                                              \

    
2101
/* Fast bilinear horizontal luma scaler. On MMX2-capable builds (and when the
 * context says the runtime-generated scaler is usable) it calls the
 * per-context generated code in c->lumMmx2FilterCode in 8 chunks; otherwise
 * it falls back to a generic x86 integer-asm bilinear loop. */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);  // manual save slot: ebx is the PIC register
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

/* Invoke one chunk of the runtime-generated filter code, then advance the
 * source and destination pointers by the per-chunk increments it reports. */
#define CALL_MMX2_FILTER_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* Tail fixup: outputs whose source position falls past srcW-1 get
         * the last source sample, scaled to the 15-bit intermediate range. */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    x86_reg dstWidth_reg = dstWidth;
    //NO MMX just normal asm ...
    /* Two outputs per iteration; the add/adc pair advances the 16.16
     * fixed-point source position (xx in REG_d, xalpha in cx). */
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ".p2align                4           \n\t"
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
}
2212

    
2213
      // *** horizontal scale Y line to temp buffer
2214
/* Horizontally scale one luma (or alpha, when isAlpha) input line into the
 * intermediate buffer: optional input-format conversion to Y, then either the
 * generic FIR hScale or the fast bilinear path, then optional range
 * conversion (luma only). */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                   const int16_t *hLumFilter,
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal, int isAlpha)
{
    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *);
    void (*convertRange)(uint16_t *, int);

    if (isAlpha) {
        toYV12       = c->alpToYV12;
        convertRange = NULL;                 /* no range conversion for alpha */
        src         += c->alpSrcOffset;
    } else {
        toYV12       = c->lumToYV12;
        convertRange = c->lumConvertRange;
        src         += c->lumSrcOffset;
    }

    if (toYV12) {
        /* convert the input line to 8-bit Y first, then scale that */
        toYV12(formatConvBuffer, src, srcW, pal);
        src = formatConvBuffer;
    }

    if (c->hyscale_fast) // fast bilinear upscale / crap downscale
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
    else
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);

    if (convertRange)
        convertRange(dst, dstWidth);
}
2239

    
2240
/* Fast bilinear horizontal chroma scaler: scales src1 (U) into dst and
 * src2 (V) into dst+VOFW. On MMX2 builds it calls the runtime-generated
 * filter code in c->chrMmx2FilterCode (4 chunks per plane); otherwise a
 * generic x86 integer-asm bilinear loop handles both planes per iteration. */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);  // manual save slot: ebx is the PIC register
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            /* second plane: reset i, point at src2 and at dst+VOF */
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* Tail fixup: outputs sourced past srcW-1 replicate the last sample,
         * scaled to the 15-bit intermediate range. */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            //printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        x86_reg dstWidth_reg = dstWidth;
        /* One U and one V output per iteration, sharing the 16.16 fixed-point
         * position (xx in REG_d, xalpha in cx). */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
            ".p2align    4                          \n\t"
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
            :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
}
2345

    
2346
/* Horizontally scale one pair of chroma input lines (src1=U, src2=V) into
 * the intermediate buffer (V at offset VOFW): optional input-format
 * conversion, then the generic FIR hScale per plane or the fast bilinear
 * path for both, then optional chroma range conversion. */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{
    src1 += c->chrSrcOffset;
    src2 += c->chrSrcOffset;

    if (c->chrToYV12) {
        /* convert both planes into the conversion buffer, then scale that */
        uint8_t *convV = formatConvBuffer + VOFW;
        c->chrToYV12(formatConvBuffer, convV, src1, src2, srcW, pal);
        src1 = formatConvBuffer;
        src2 = convV;
    }

    if (c->hcscale_fast) { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
    } else {
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
}
2372

    
2373
/* Compile-time switch for verbose ring-buffer tracing in swScale();
 * with DEBUG_SWSCALE_BUFFERS 0 the av_log call is dead code the compiler
 * drops. DEBUG_BUFFERS expects a SwsContext* named 'c' in scope. */
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2375

    
2376
static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
2377
                           int srcSliceH, uint8_t* dst[], int dstStride[])
2378
{
2379
    /* load a few things into local vars to make the code more readable? and faster */
2380
    const int srcW= c->srcW;
2381
    const int dstW= c->dstW;
2382
    const int dstH= c->dstH;
2383
    const int chrDstW= c->chrDstW;
2384
    const int chrSrcW= c->chrSrcW;
2385
    const int lumXInc= c->lumXInc;
2386
    const int chrXInc= c->chrXInc;
2387
    const enum PixelFormat dstFormat= c->dstFormat;
2388
    const int flags= c->flags;
2389
    int16_t *vLumFilterPos= c->vLumFilterPos;
2390
    int16_t *vChrFilterPos= c->vChrFilterPos;
2391
    int16_t *hLumFilterPos= c->hLumFilterPos;
2392
    int16_t *hChrFilterPos= c->hChrFilterPos;
2393
    int16_t *vLumFilter= c->vLumFilter;
2394
    int16_t *vChrFilter= c->vChrFilter;
2395
    int16_t *hLumFilter= c->hLumFilter;
2396
    int16_t *hChrFilter= c->hChrFilter;
2397
    int32_t *lumMmxFilter= c->lumMmxFilter;
2398
    int32_t *chrMmxFilter= c->chrMmxFilter;
2399
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2400
    const int vLumFilterSize= c->vLumFilterSize;
2401
    const int vChrFilterSize= c->vChrFilterSize;
2402
    const int hLumFilterSize= c->hLumFilterSize;
2403
    const int hChrFilterSize= c->hChrFilterSize;
2404
    int16_t **lumPixBuf= c->lumPixBuf;
2405
    int16_t **chrPixBuf= c->chrPixBuf;
2406
    int16_t **alpPixBuf= c->alpPixBuf;
2407
    const int vLumBufSize= c->vLumBufSize;
2408
    const int vChrBufSize= c->vChrBufSize;
2409
    uint8_t *formatConvBuffer= c->formatConvBuffer;
2410
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2411
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2412
    int lastDstY;
2413
    uint32_t *pal=c->pal_yuv;
2414

    
2415
    /* vars which will change and which we need to store back in the context */
2416
    int dstY= c->dstY;
2417
    int lumBufIndex= c->lumBufIndex;
2418
    int chrBufIndex= c->chrBufIndex;
2419
    int lastInLumBuf= c->lastInLumBuf;
2420
    int lastInChrBuf= c->lastInChrBuf;
2421

    
2422
    if (isPacked(c->srcFormat)) {
2423
        src[0]=
2424
        src[1]=
2425
        src[2]=
2426
        src[3]= src[0];
2427
        srcStride[0]=
2428
        srcStride[1]=
2429
        srcStride[2]=
2430
        srcStride[3]= srcStride[0];
2431
    }
2432
    srcStride[1]<<= c->vChrDrop;
2433
    srcStride[2]<<= c->vChrDrop;
2434

    
2435
    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2436
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2437
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2438
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2439
                   srcSliceY,    srcSliceH,    dstY,    dstH);
2440
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2441
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2442

    
2443
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2444
        static int warnedAlready=0; //FIXME move this into the context perhaps
2445
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
2446
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2447
                   "         ->cannot do aligned memory accesses anymore\n");
2448
            warnedAlready=1;
2449
        }
2450
    }
2451

    
2452
    /* Note the user might start scaling the picture in the middle so this
2453
       will not get executed. This is not really intended but works
2454
       currently, so people might do it. */
2455
    if (srcSliceY ==0) {
2456
        lumBufIndex=-1;
2457
        chrBufIndex=-1;
2458
        dstY=0;
2459
        lastInLumBuf= -1;
2460
        lastInChrBuf= -1;
2461
    }
2462

    
2463
    lastDstY= dstY;
2464

    
2465
    for (;dstY < dstH; dstY++) {
2466
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
2467
        const int chrDstY= dstY>>c->chrDstVSubSample;
2468
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
2469
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
2470
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;
2471

    
2472
        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2473
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2474
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2475
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2476
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2477
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2478
        int enough_lines;
2479

    
2480
        //handle holes (FAST_BILINEAR & weird filters)
2481
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2482
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2483
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2484
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2485

    
2486
        DEBUG_BUFFERS("dstY: %d\n", dstY);
2487
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2488
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2489
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2490
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2491

    
2492
        // Do we have enough lines in this slice to output the dstY line
2493
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2494

    
2495
        if (!enough_lines) {
2496
            lastLumSrcY = srcSliceY + srcSliceH - 1;
2497
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2498
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2499
                                            lastLumSrcY, lastChrSrcY);
2500
        }
2501

    
2502
        //Do horizontal scaling
2503
        while(lastInLumBuf < lastLumSrcY) {
2504
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2505
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2506
            lumBufIndex++;
2507
            assert(lumBufIndex < 2*vLumBufSize);
2508
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2509
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
2510
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2511
                            hLumFilter, hLumFilterPos, hLumFilterSize,
2512
                            formatConvBuffer,
2513
                            pal, 0);
2514
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2515
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
2516
                                hLumFilter, hLumFilterPos, hLumFilterSize,
2517
                                formatConvBuffer,
2518
                                pal, 1);
2519
            lastInLumBuf++;
2520
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2521
                               lumBufIndex,    lastInLumBuf);
2522
        }
2523
        while(lastInChrBuf < lastChrSrcY) {
2524
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2525
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2526
            chrBufIndex++;
2527
            assert(chrBufIndex < 2*vChrBufSize);
2528
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2529
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2530
            //FIXME replace parameters through context struct (some at least)
2531

    
2532
            if (c->needs_hcscale)
2533
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
2534
                                hChrFilter, hChrFilterPos, hChrFilterSize,
2535
                                formatConvBuffer,
2536
                                pal);
2537
            lastInChrBuf++;
2538
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2539
                               chrBufIndex,    lastInChrBuf);
2540
        }
2541
        //wrap buf index around to stay inside the ring buffer
2542
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2543
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2544
        if (!enough_lines)
2545
            break; //we can't output a dstY line so let's try with the next slice
2546

    
2547
        c->blueDither= ff_dither8[dstY&1];
2548
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
2549
            c->greenDither= ff_dither8[dstY&1];
2550
        else
2551
            c->greenDither= ff_dither4[dstY&1];
2552
        c->redDither= ff_dither8[(dstY+1)&1];
2553
        if (dstY < dstH-2) {
2554
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2555
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2556
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2557
            int i;
2558
            if (flags & SWS_ACCURATE_RND) {
2559
                int s= APCK_SIZE / 8;
2560
                for (i=0; i<vLumFilterSize; i+=2) {
2561
                    *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
2562
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
2563
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
2564
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
2565
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
2566
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2567
                        *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
2568
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
2569
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
2570
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
2571
                    }
2572
                }
2573
                for (i=0; i<vChrFilterSize; i+=2) {
2574
                    *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
2575
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
2576
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
2577
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
2578
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
2579
                }
2580
            } else {
2581
                for (i=0; i<vLumFilterSize; i++) {
2582
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
2583
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
2584
                    lumMmxFilter[4*i+2]=
2585
                    lumMmxFilter[4*i+3]=
2586
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
2587
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
2588
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
2589
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
2590
                        alpMmxFilter[4*i+2]=
2591
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
2592
                    }
2593
                }
2594
                for (i=0; i<vChrFilterSize; i++) {
2595
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
2596
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
2597
                    chrMmxFilter[4*i+2]=
2598
                    chrMmxFilter[4*i+3]=
2599
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
2600
                }
2601
            }
2602
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2603
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2604
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2605
                c->yuv2nv12X(c,
2606
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2607
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2608
                             dest, uDest, dstW, chrDstW, dstFormat);
2609
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2610
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2611
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2612
                if (is16BPS(dstFormat)) {
2613
                    yuv2yuvX16inC(
2614
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2615
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2616
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2617
                                  dstFormat);
2618
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2619
                    const int16_t *lumBuf = lumSrcPtr[0];
2620
                    const int16_t *chrBuf= chrSrcPtr[0];
2621
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2622
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
2623
                } else { //General YV12
2624
                    c->yuv2yuvX(c,
2625
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2626
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2627
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2628
                }
2629
            } else {
2630
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2631
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2632
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2633
                    int chrAlpha= vChrFilter[2*dstY+1];
2634
                    if(flags & SWS_FULL_CHR_H_INT) {
2635
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
2636
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2637
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2638
                                         alpSrcPtr, dest, dstW, dstY);
2639
                    } else {
2640
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
2641
                                       alpPixBuf ? *alpSrcPtr : NULL,
2642
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
2643
                    }
2644
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2645
                    int lumAlpha= vLumFilter[2*dstY+1];
2646
                    int chrAlpha= vChrFilter[2*dstY+1];
2647
                    lumMmxFilter[2]=
2648
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
2649
                    chrMmxFilter[2]=
2650
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
2651
                    if(flags & SWS_FULL_CHR_H_INT) {
2652
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
2653
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2654
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2655
                                         alpSrcPtr, dest, dstW, dstY);
2656
                    } else {
2657
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
2658
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
2659
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
2660
                    }
2661
                } else { //general RGB
2662
                    if(flags & SWS_FULL_CHR_H_INT) {
2663
                        yuv2rgbXinC_full(c,
2664
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2665
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2666
                                         alpSrcPtr, dest, dstW, dstY);
2667
                    } else {
2668
                        c->yuv2packedX(c,
2669
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2670
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2671
                                       alpSrcPtr, dest, dstW, dstY);
2672
                    }
2673
                }
2674
            }
2675
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
2676
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2677
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2678
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2679
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2680
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2681
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
2682
                yuv2nv12XinC(
2683
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2684
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2685
                             dest, uDest, dstW, chrDstW, dstFormat);
2686
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
2687
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2688
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
2689
                if (is16BPS(dstFormat)) {
2690
                    yuv2yuvX16inC(
2691
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2692
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2693
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
2694
                                  dstFormat);
2695
                } else {
2696
                    yuv2yuvXinC(
2697
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
2698
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2699
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
2700
                }
2701
            } else {
2702
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
2703
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
2704
                if(flags & SWS_FULL_CHR_H_INT) {
2705
                    yuv2rgbXinC_full(c,
2706
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2707
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2708
                                     alpSrcPtr, dest, dstW, dstY);
2709
                } else {
2710
                    yuv2packedXinC(c,
2711
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
2712
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
2713
                                   alpSrcPtr, dest, dstW, dstY);
2714
                }
2715
            }
2716
        }
2717
    }
2718

    
2719
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2720
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2721

    
2722
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
2723
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
2724
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
2725
    else                             __asm__ volatile("emms"  :::"memory");
2726
    /* store changed local vars back in the context */
2727
    c->dstY= dstY;
2728
    c->lumBufIndex= lumBufIndex;
2729
    c->chrBufIndex= chrBufIndex;
2730
    c->lastInLumBuf= lastInLumBuf;
2731
    c->lastInChrBuf= lastInChrBuf;
2732

    
2733
    return dstY - lastDstY;
2734
}
2735

    
2736
/**
 * Fill in the CPU-specific (RENAME'd) function pointers of the scaler
 * context: vertical output scalers, horizontal scalers, input unpackers
 * and range-conversion helpers.
 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    const enum PixelFormat srcFormat = c->srcFormat;

    /* Vertical scaling / output stage. */
    c->yuv2nv12X   = RENAME(yuv2nv12X);
    c->yuv2yuv1    = RENAME(yuv2yuv1);
    c->yuv2yuvX    = RENAME(yuv2yuvX);
    c->yuv2packed1 = RENAME(yuv2packed1);
    c->yuv2packed2 = RENAME(yuv2packed2);
    c->yuv2packedX = RENAME(yuv2packedX);

    /* Generic horizontal scaler. */
    c->hScale = RENAME(hScale);

    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if ((c->flags & SWS_FAST_BILINEAR) && c->canMMX2BeUsed) {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    } else {
        c->hyscale_fast = NULL;
        c->hcscale_fast = NULL;
    }

    /* Chroma input unpacking. The RGB24/BGR24 entries depend on whether
     * horizontal chroma subsampling lets us average pairs of pixels. */
    switch (srcFormat) {
    case PIX_FMT_YUYV422:
        c->chrToYV12 = RENAME(yuy2ToUV);
        break;
    case PIX_FMT_UYVY422:
        c->chrToYV12 = RENAME(uyvyToUV);
        break;
    case PIX_FMT_NV12:
        c->chrToYV12 = RENAME(nv12ToUV);
        break;
    case PIX_FMT_NV21:
        c->chrToYV12 = RENAME(nv21ToUV);
        break;
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
        c->chrToYV12 = RENAME(BEToUV);
        break;
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
        c->chrToYV12 = RENAME(LEToUV);
        break;
    case PIX_FMT_BGR24:
        c->chrToYV12 = c->chrSrcHSubSample ? RENAME(bgr24ToUV_half)
                                           : RENAME(bgr24ToUV);
        break;
    case PIX_FMT_RGB24:
        c->chrToYV12 = c->chrSrcHSubSample ? RENAME(rgb24ToUV_half)
                                           : RENAME(rgb24ToUV);
        break;
    default:
        break;
    }

    /* Luma input unpacking. */
    switch (srcFormat) {
    case PIX_FMT_YUYV422:
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_Y400A:
    case PIX_FMT_GRAY16BE:
        c->lumToYV12 = RENAME(yuy2ToY);
        break;
    case PIX_FMT_UYVY422:
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE:
        c->lumToYV12 = RENAME(uyvyToY);
        break;
    case PIX_FMT_BGR24:
        c->lumToYV12 = RENAME(bgr24ToY);
        break;
    case PIX_FMT_RGB24:
        c->lumToYV12 = RENAME(rgb24ToY);
        break;
    default:
        break;
    }

    /* Alpha unpacking: only Y400A carries alpha that needs extraction here. */
    if (c->alpPixBuf && srcFormat == PIX_FMT_Y400A)
        c->alpToYV12 = RENAME(yuy2ToY);

    /* Range conversion (JPEG full range <-> MPEG limited range) is only
     * needed for non-RGB destinations when source and destination ranges
     * differ. */
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }
}