/*
 * ffmpeg / libswscale / swscale_template.c @ 80264dc4
 */
1
/*
2
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or
7
 * modify it under the terms of the GNU Lesser General Public
8
 * License as published by the Free Software Foundation; either
9
 * version 2.1 of the License, or (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
 * Lesser General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU Lesser General Public
17
 * License along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 */
20

    
21
/* This template is included several times with different COMPILE_TEMPLATE_*
 * settings, so undo any definitions left over from a previous inclusion. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH

/* Select the prefetch instruction supported by the target CPU;
 * fall back to a no-op when neither 3DNow! nor MMX2 is compiled in. */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH  " # nop"
#endif
/* Packed byte average: pavgb on MMX2, the 3DNow! equivalent pavgusb otherwise.
 * Left undefined when neither extension is available. */
#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
/* Non-temporal store when MMX2 is available (bypasses the cache for
 * write-only destination buffers); plain movq otherwise.  The extra
 * REAL_ indirection lets macro arguments expand before stringification. */
#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
/* Pull in the AltiVec implementations when compiling the PPC variant. */
#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif
/*
 * Vertical scale to planar YV12: multiply-accumulate source rows against
 * filter coefficients (pmulhw/paddw over a {src ptr, coeff} list terminated
 * by a NULL pointer), then round (>>3) and pack to unsigned bytes.
 * Operands: %0 = &c->redDither (base address for *_OFFSET context fields),
 * %1 = dest, %2 = width.  Clobbers REG_a/REG_d/REG_S and mm0-mm5.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t" /* FIXME Unroll? */\
        "1:                                                 \n\t"\
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
        "add                                $16, %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "pmulhw                           %%mm0, %%mm2      \n\t"\
        "pmulhw                           %%mm0, %%mm5      \n\t"\
        "paddw                            %%mm2, %%mm3      \n\t"\
        "paddw                            %%mm5, %%mm4      \n\t"\
        " jnz                                1b             \n\t"\
        "psraw                               $3, %%mm3      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "packuswb                         %%mm4, %%mm3      \n\t"\
        MOVNTQ(%%mm3, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" ((x86_reg)width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
/*
 * Higher-precision variant of YSCALEYUV2YV12X: accumulates in 32 bits via
 * punpck + pmaddwd into mm4-mm7, walking an APCK_SIZE-strided table of
 * {ptr, ptr2, coeff} entries (NULL ptr terminates), then shifts back down,
 * adds the rounder and packs to bytes.  Same operands/clobbers as the
 * fast variant: %0 = &c->redDither, %1 = dest, %2 = width.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t"\
        "1:                                                 \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
        "movq                             %%mm0, %%mm3      \n\t"\
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
        "paddd                            %%mm0, %%mm4      \n\t"\
        "paddd                            %%mm3, %%mm5      \n\t"\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "movq                             %%mm2, %%mm0      \n\t"\
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "paddd                            %%mm2, %%mm6      \n\t"\
        "paddd                            %%mm0, %%mm7      \n\t"\
        " jnz                                1b             \n\t"\
        "psrad                              $16, %%mm4      \n\t"\
        "psrad                              $16, %%mm5      \n\t"\
        "psrad                              $16, %%mm6      \n\t"\
        "psrad                              $16, %%mm7      \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
        "packssdw                         %%mm5, %%mm4      \n\t"\
        "packssdw                         %%mm7, %%mm6      \n\t"\
        "paddw                            %%mm0, %%mm4      \n\t"\
        "paddw                            %%mm0, %%mm6      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "psraw                               $3, %%mm6      \n\t"\
        "packuswb                         %%mm6, %%mm4      \n\t"\
        MOVNTQ(%%mm4, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" ((x86_reg)width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
/*
 * 1:1 vertical copy (single source row): shift 16-bit intermediates down by 7
 * and pack to bytes.  Asm fragment only — %0/%1/%2 (src, dest, counter init)
 * come from the enclosing __asm__ block's constraints.
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ".p2align               4             \n\t" /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
/*
 * Like YSCALEYUV2YV121 but rounds before the >>7: mm7 is built as the
 * constant 0x0040 per word (pcmpeqw/psrlw/psllw) and added with saturation.
 * Asm fragment only — operands come from the enclosing __asm__ block.
 */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ".p2align                4            \n\t" /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW_reg),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/*
 * Opens the packed-output scaling asm block and performs the chroma
 * multiply-accumulate: U ends up in mm3, V in mm4 (VOF bytes past U in the
 * same buffer).  Must be paired with a *_YA fragment and
 * YSCALEYUV2PACKEDX_END, which closes the __asm__ statement.
 */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
        "movq                      %%mm3, %%mm4         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
        "add                         $16, %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pmulhw                    %%mm0, %%mm2         \n\t"\
        "pmulhw                    %%mm0, %%mm5         \n\t"\
        "paddw                     %%mm2, %%mm3         \n\t"\
        "paddw                     %%mm5, %%mm4         \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        " jnz                         2b                \n\t"\

/*
 * Luma (or alpha) multiply-accumulate companion to YSCALEYUV2PACKEDX_UV:
 * accumulates two groups of 4 Y samples into dst1/dst2 using the filter
 * list at "offset".  Register names are passed in so the same fragment
 * serves both Y and A planes.
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\

/* Standard packed-output prologue: chroma accumulation then luma accumulation
 * (Y1 in mm1, Y2 in mm7). */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

/* Closes the __asm__ statement opened by YSCALEYUV2PACKEDX(_ACCURATE)_UV:
 * supplies the operand constraints (%0 = &c->redDither, %4 = dest,
 * %5 = dstW_reg) and the clobber list. */
#define YSCALEYUV2PACKEDX_END                     \
        :: "r" (&c->redDither),                   \
            "m" (dummy), "m" (dummy), "m" (dummy),\
            "r" (dest), "m" (dstW_reg)            \
        : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
/*
 * Accurate chroma accumulation for packed output: 32-bit accumulation via
 * punpck + pmaddwd over an APCK-strided filter table, then shift, pack,
 * add the rounder and spill U/V to the context scratch slots
 * U_TEMP/V_TEMP (the *_YA fragment needs all mm registers meanwhile).
 * Opens the __asm__ block; pair with *_ACCURATE_YA and YSCALEYUV2PACKEDX_END.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pxor                      %%mm4, %%mm4         \n\t"\
        "pxor                      %%mm5, %%mm5         \n\t"\
        "pxor                      %%mm6, %%mm6         \n\t"\
        "pxor                      %%mm7, %%mm7         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
        "movq                      %%mm0, %%mm3         \n\t"\
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
        "paddd                     %%mm0, %%mm4         \n\t"\
        "paddd                     %%mm3, %%mm5         \n\t"\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        "movq                      %%mm2, %%mm0         \n\t"\
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "paddd                     %%mm2, %%mm6         \n\t"\
        "paddd                     %%mm0, %%mm7         \n\t"\
        " jnz                         2b                \n\t"\
        "psrad                       $16, %%mm4         \n\t"\
        "psrad                       $16, %%mm5         \n\t"\
        "psrad                       $16, %%mm6         \n\t"\
        "psrad                       $16, %%mm7         \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
        "packssdw                  %%mm5, %%mm4         \n\t"\
        "packssdw                  %%mm7, %%mm6         \n\t"\
        "paddw                     %%mm0, %%mm4         \n\t"\
        "paddw                     %%mm0, %%mm6         \n\t"\
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\

/*
 * Accurate luma accumulation companion to YSCALEYUV2PACKEDX_ACCURATE_UV:
 * 32-bit pmaddwd accumulation of Y1/Y2 into mm1/mm7, then reloads the
 * previously spilled U/V from U_TEMP/V_TEMP into mm3/mm4 so the RGB
 * conversion fragments find registers in their expected layout.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\

/* Accurate packed-output prologue: chroma then luma accumulation. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/*
 * YUV -> RGB conversion fragment for the packed-X path.  Expects
 * Y1/Y2 in mm1/mm7 and U/V in mm3/mm4; applies the per-context offset and
 * coefficient table at %0, interleaves per-pixel and leaves packed
 * B in mm2, R in mm5, G in mm4 (8 pixels).
 */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\

/*
 * Two-row (bilinear) vertical interpolation producing packed YUV output.
 * Pre-scales the stored luma/chroma alpha coefficients by >>3 once before
 * the loop, then blends uvbuf0/uvbuf1 (chroma -> mm3/mm4) and buf0/buf1
 * (luma -> mm1/mm7) per iteration.  %0/%1 = luma rows, %2/%3 = chroma rows.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

/* Argument-expanding wrapper (see REAL_MOVNTQ comment). */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
/*
 * Two-row chroma interpolation + start of the YUV->RGB math: blends
 * uvbuf0/uvbuf1, subtracts the U/V offsets and computes the green
 * contributions (mm3=ug, mm4=vg, mm2/mm5 keep U-128/V-128 for B/R).
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

/*
 * Two-row luma interpolation: blends b1/b2 rows with the stored luma
 * alpha coefficient, leaving interpolated Y1 in mm1 and Y2 in mm7.
 * b1/b2 are passed in so callers can select which operands hold the rows.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

/*
 * Final YUV->RGB coefficient stage: takes mm1/mm7 (Y1/Y2), mm2/mm5
 * (U-128/V-128) and mm3/mm4 (ug/vg) from the _UV/_YA fragments and
 * produces packed B in mm2, R in mm5, G in mm4 for 8 pixels.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

/* Argument-expanding wrapper (see REAL_MOVNTQ comment). */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
/* Full two-row YUV->RGB pipeline: chroma blend, luma blend (rows in %0/%1),
 * then the coefficient/pack stage. */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
/*
 * Single-row (no vertical interpolation) packed YUV path: just shift the
 * 16-bit intermediates down (>>7).  Chroma -> mm3/mm4, luma -> mm1/mm7.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

/* Argument-expanding wrapper (see REAL_MOVNTQ comment). */
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
/*
 * Single-row YUV->RGB: like YSCALEYUV2RGB but with no second source row,
 * so the blend step collapses to a >>4 shift.  Output register layout
 * matches the other RGB fragments (B in mm2, R in mm5, G in mm4).
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

/* Indirection so that stringified arguments expand correctly. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)

/* Unscaled packed output with vertical chroma interpolation:
   averages uvbuf0/uvbuf1 chroma, loads luma from buf0 unchanged. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
/* Converts one pass of YUV to packed RGB components, averaging the two
   chroma buffers vertically; results end up as packed B/R/G bytes in
   mm2/mm5/mm4 for the WRITE* macros. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)

/* Loads 8 alpha samples from abuf0, scales down by 7 bits and packs
   them to bytes in mm7 for the 32-bit RGB writers. */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

/* Interleaves the b/g/r/a byte registers into 8 four-byte pixels,
   stores them at dst+index*4, then advances index and loops to "1:". */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

/* Packs the B/G/R byte registers (mm2/mm4/mm5) into 5-6-5 16-bit
   pixels and stores 8 of them at dst+index*2, then loops to "1:". */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)

/* Packs the B/G/R byte registers (mm2/mm4/mm5) into 5-5-5 15-bit
   pixels and stores 8 of them at dst+index*2, then loops to "1:". */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)

/* Legacy 24-bit packer: builds 0RGB dwords, then shifts/masks them
   into 24 contiguous bytes (3 quadword stores) per 8 pixels. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"

/* MMX 24-bit packer: converts 8 pixels of B/G/R bytes (mm2/mm4/mm5)
   into 24 packed bytes at dst via shift/or combining, then loops. */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"

/* MMX2 24-bit packer: uses pshufw plus the ff_M24A/B/C byte masks to
   assemble 24 packed bytes per 8 pixels, then loops. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"

/* Select the 24-bit writer matching the instruction set this template
   instantiation is compiled for. */
#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

/* Interleaves luma (mm1/mm7) with packed U (mm3) and V (mm4) into
   YUYV order and stores 16 bytes at dst+index*2, then loops. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)

/**
 * Vertical multi-tap scale to planar YV12 (plus optional alpha plane).
 * Uses the MMX filter paths unless SWS_BITEXACT is requested, otherwise
 * falls back to AltiVec or the plain C implementation.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}

/**
 * Vertical multi-tap scale to NV12/NV21; simply forwards to the C
 * implementation (no SIMD path for these formats here).
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

/**
 * 1:1 vertical scale (single source line) to planar YV12 plus optional
 * alpha. MMX path handles all four planes via a small dispatch table;
 * the C fallback rounds, shifts by 7 and clips each sample to 0..255.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        /* out-of-range values have bit 8 set after the shift */
        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}

/**
1020
 * vertical scale YV12 to RGB
1021
 */
1022
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1023
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1024
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1025
{
1026
#if COMPILE_TEMPLATE_MMX
1027
    x86_reg dummy=0;
1028
    x86_reg dstW_reg = dstW;
1029
    if(!(c->flags & SWS_BITEXACT)) {
1030
        if (c->flags & SWS_ACCURATE_RND) {
1031
            switch(c->dstFormat) {
1032
            case PIX_FMT_RGB32:
1033
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1034
                    YSCALEYUV2PACKEDX_ACCURATE
1035
                    YSCALEYUV2RGBX
1036
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1037
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1038
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1039
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1040
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1041
                    "psraw                        $3, %%mm1         \n\t"
1042
                    "psraw                        $3, %%mm7         \n\t"
1043
                    "packuswb                  %%mm7, %%mm1         \n\t"
1044
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1045

    
1046
                    YSCALEYUV2PACKEDX_END
1047
                } else {
1048
                    YSCALEYUV2PACKEDX_ACCURATE
1049
                    YSCALEYUV2RGBX
1050
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1051
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1052

    
1053
                    YSCALEYUV2PACKEDX_END
1054
                }
1055
                return;
1056
            case PIX_FMT_BGR24:
1057
                YSCALEYUV2PACKEDX_ACCURATE
1058
                YSCALEYUV2RGBX
1059
                "pxor %%mm7, %%mm7 \n\t"
1060
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1061
                "add %4, %%"REG_c"                        \n\t"
1062
                WRITEBGR24(%%REGc, %5, %%REGa)
1063

    
1064

    
1065
                :: "r" (&c->redDither),
1066
                "m" (dummy), "m" (dummy), "m" (dummy),
1067
                "r" (dest), "m" (dstW_reg)
1068
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1069
                );
1070
                return;
1071
            case PIX_FMT_RGB555:
1072
                YSCALEYUV2PACKEDX_ACCURATE
1073
                YSCALEYUV2RGBX
1074
                "pxor %%mm7, %%mm7 \n\t"
1075
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1076
#ifdef DITHER1XBPP
1077
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1078
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1079
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1080
#endif
1081

    
1082
                WRITERGB15(%4, %5, %%REGa)
1083
                YSCALEYUV2PACKEDX_END
1084
                return;
1085
            case PIX_FMT_RGB565:
1086
                YSCALEYUV2PACKEDX_ACCURATE
1087
                YSCALEYUV2RGBX
1088
                "pxor %%mm7, %%mm7 \n\t"
1089
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1090
#ifdef DITHER1XBPP
1091
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1092
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1093
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1094
#endif
1095

    
1096
                WRITERGB16(%4, %5, %%REGa)
1097
                YSCALEYUV2PACKEDX_END
1098
                return;
1099
            case PIX_FMT_YUYV422:
1100
                YSCALEYUV2PACKEDX_ACCURATE
1101
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1102

    
1103
                "psraw $3, %%mm3    \n\t"
1104
                "psraw $3, %%mm4    \n\t"
1105
                "psraw $3, %%mm1    \n\t"
1106
                "psraw $3, %%mm7    \n\t"
1107
                WRITEYUY2(%4, %5, %%REGa)
1108
                YSCALEYUV2PACKEDX_END
1109
                return;
1110
            }
1111
        } else {
1112
            switch(c->dstFormat) {
1113
            case PIX_FMT_RGB32:
1114
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1115
                    YSCALEYUV2PACKEDX
1116
                    YSCALEYUV2RGBX
1117
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1118
                    "psraw                        $3, %%mm1         \n\t"
1119
                    "psraw                        $3, %%mm7         \n\t"
1120
                    "packuswb                  %%mm7, %%mm1         \n\t"
1121
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1122
                    YSCALEYUV2PACKEDX_END
1123
                } else {
1124
                    YSCALEYUV2PACKEDX
1125
                    YSCALEYUV2RGBX
1126
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1127
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1128
                    YSCALEYUV2PACKEDX_END
1129
                }
1130
                return;
1131
            case PIX_FMT_BGR24:
1132
                YSCALEYUV2PACKEDX
1133
                YSCALEYUV2RGBX
1134
                "pxor                    %%mm7, %%mm7       \n\t"
1135
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1136
                "add                        %4, %%"REG_c"   \n\t"
1137
                WRITEBGR24(%%REGc, %5, %%REGa)
1138

    
1139
                :: "r" (&c->redDither),
1140
                "m" (dummy), "m" (dummy), "m" (dummy),
1141
                "r" (dest),  "m" (dstW_reg)
1142
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1143
                );
1144
                return;
1145
            case PIX_FMT_RGB555:
1146
                YSCALEYUV2PACKEDX
1147
                YSCALEYUV2RGBX
1148
                "pxor %%mm7, %%mm7 \n\t"
1149
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1150
#ifdef DITHER1XBPP
1151
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1152
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1153
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1154
#endif
1155

    
1156
                WRITERGB15(%4, %5, %%REGa)
1157
                YSCALEYUV2PACKEDX_END
1158
                return;
1159
            case PIX_FMT_RGB565:
1160
                YSCALEYUV2PACKEDX
1161
                YSCALEYUV2RGBX
1162
                "pxor %%mm7, %%mm7 \n\t"
1163
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1164
#ifdef DITHER1XBPP
1165
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1166
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1167
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1168
#endif
1169

    
1170
                WRITERGB16(%4, %5, %%REGa)
1171
                YSCALEYUV2PACKEDX_END
1172
                return;
1173
            case PIX_FMT_YUYV422:
1174
                YSCALEYUV2PACKEDX
1175
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1176

    
1177
                "psraw $3, %%mm3    \n\t"
1178
                "psraw $3, %%mm4    \n\t"
1179
                "psraw $3, %%mm1    \n\t"
1180
                "psraw $3, %%mm7    \n\t"
1181
                WRITEYUY2(%4, %5, %%REGa)
1182
                YSCALEYUV2PACKEDX_END
1183
                return;
1184
            }
1185
        }
1186
    }
1187
#endif /* COMPILE_TEMPLATE_MMX */
1188
#if COMPILE_TEMPLATE_ALTIVEC
1189
    /* The following list of supported dstFormat values should
1190
       match what's found in the body of ff_yuv2packedX_altivec() */
1191
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1192
         (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1193
          c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1194
          c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1195
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1196
                                   chrFilter, chrSrc, chrFilterSize,
1197
                                   dest, dstW, dstY);
1198
    else
1199
#endif
1200
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1201
                       chrFilter, chrSrc, chrFilterSize,
1202
                       alpSrc, dest, dstW, dstY);
1203
}
/**
 * vertical bilinear scale YV12 to RGB
 *
 * Blends two source scanlines (buf0/buf1 for luma, uvbuf0/uvbuf1 for chroma,
 * and abuf0/abuf1 for alpha when an alpha plane is present) using the 12-bit
 * weights yalpha/uvalpha (yalpha1 = 4095 - yalpha below), then packs the
 * result into 'dest' in the layout selected by c->dstFormat.
 *
 * The MMX fast paths are only used when SWS_BITEXACT is not requested; all
 * remaining formats (and the bit-exact case) fall through to the generic C
 * macro at the bottom.  Each asm block manually saves/restores %ebx/%rbx via
 * ESP_OFFSET in the context — NOTE(review): presumably because that register
 * may be reserved (e.g. for PIC) and so cannot appear in the clobber list.
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;
    int uvalpha1=4095-uvalpha;
    int i;

#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        switch(c->dstFormat) {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
                /* On x86-64 there are enough registers to pass abuf0/abuf1
                 * directly as extra operands (%6/%7). */
                __asm__ volatile(
                    YSCALEYUV2RGB(%%r8, %5)
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb            %%mm7, %%mm1       \n\t"
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                    "a" (&c->redDither)
                    ,"r" (abuf0), "r" (abuf1)
                    : "%r8"
                );
#else
                /* On 32-bit x86 there are no free registers for the alpha
                 * buffers, so they are stashed in the context and reloaded
                 * inside the asm (with %0/%1 temporarily pushed). */
                c->u_temp=(intptr_t)abuf0;
                c->v_temp=(intptr_t)abuf1;
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push                   %0              \n\t"
                    "push                   %1              \n\t"
                    "mov          "U_TEMP"(%5), %0          \n\t"
                    "mov          "V_TEMP"(%5), %1          \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb            %%mm7, %%mm1       \n\t"
                    "pop                    %1              \n\t"
                    "pop                    %0              \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
#endif
            } else {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    /* no alpha plane: fill the alpha bytes with 0xFF */
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                /* add per-channel ordered-dither values before truncating to 5/5/5 */
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                /* add per-channel ordered-dither values before truncating to 5/6/5 */
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov %4, %%"REG_b"                        \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //COMPILE_TEMPLATE_MMX
    /* generic C fallback for every format not handled above */
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}
/**
 * YV12 to RGB without scaling or interpolating
 *
 * Fast path for unscaled output: one luma line (buf0) is packed directly.
 * If uvalpha < 2048 only uvbuf0 is used (the comment below notes this shifts
 * chrominance by half a pixel but is faster); otherwise uvbuf0/uvbuf1 are
 * averaged (the YSCALEYUV2*1b variants).  With SWS_FULL_CHR_H_INT set the
 * work is delegated to the bilinear yuv2packed2 path instead.
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        /* full horizontal chroma interpolation: reuse the 2-line blender
         * with both line pointers set to the same buffer */
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

#if COMPILE_TEMPLATE_MMX
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        /* no alpha plane: fill the alpha bytes with 0xFF */
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    /* add per-channel ordered-dither values before truncating to 5/5/5 */
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    /* add per-channel ordered-dither values before truncating to 5/6/5 */
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        } else {
            /* uvalpha >= 2048: average the two chroma lines (…1b variants) */
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        /* no alpha plane: fill the alpha bytes with 0xFF */
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    /* add per-channel ordered-dither values before truncating to 5/5/5 */
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    /* add per-channel ordered-dither values before truncating to 5/6/5 */
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
    /* generic C fallback; pick the nearest-chroma or averaged-chroma variant */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
//FIXME yuy2* can read up to 7 samples too much
1596

    
1597
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1598
{
1599
#if COMPILE_TEMPLATE_MMX
1600
    __asm__ volatile(
1601
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1602
        "mov                    %0, %%"REG_a"       \n\t"
1603
        "1:                                         \n\t"
1604
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1605
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1606
        "pand                %%mm2, %%mm0           \n\t"
1607
        "pand                %%mm2, %%mm1           \n\t"
1608
        "packuswb            %%mm1, %%mm0           \n\t"
1609
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
1610
        "add                    $8, %%"REG_a"       \n\t"
1611
        " js                    1b                  \n\t"
1612
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1613
        : "%"REG_a
1614
    );
1615
#else
1616
    int i;
1617
    for (i=0; i<width; i++)
1618
        dst[i]= src[2*i];
1619
#endif
1620
}
1621

    
1622
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1623
{
1624
#if COMPILE_TEMPLATE_MMX
1625
    __asm__ volatile(
1626
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1627
        "mov                    %0, %%"REG_a"       \n\t"
1628
        "1:                                         \n\t"
1629
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1630
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1631
        "psrlw                  $8, %%mm0           \n\t"
1632
        "psrlw                  $8, %%mm1           \n\t"
1633
        "packuswb            %%mm1, %%mm0           \n\t"
1634
        "movq                %%mm0, %%mm1           \n\t"
1635
        "psrlw                  $8, %%mm0           \n\t"
1636
        "pand                %%mm4, %%mm1           \n\t"
1637
        "packuswb            %%mm0, %%mm0           \n\t"
1638
        "packuswb            %%mm1, %%mm1           \n\t"
1639
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
1640
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
1641
        "add                    $4, %%"REG_a"       \n\t"
1642
        " js                    1b                  \n\t"
1643
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1644
        : "%"REG_a
1645
    );
1646
#else
1647
    int i;
1648
    for (i=0; i<width; i++) {
1649
        dstU[i]= src1[4*i + 1];
1650
        dstV[i]= src1[4*i + 3];
1651
    }
1652
#endif
1653
    assert(src1 == src2);
1654
}
1655

    
1656
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1657
{
1658
#if COMPILE_TEMPLATE_MMX
1659
    __asm__ volatile(
1660
        "mov                    %0, %%"REG_a"       \n\t"
1661
        "1:                                         \n\t"
1662
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1663
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1664
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1665
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1666
        "psrlw                  $8, %%mm0           \n\t"
1667
        "psrlw                  $8, %%mm1           \n\t"
1668
        "psrlw                  $8, %%mm2           \n\t"
1669
        "psrlw                  $8, %%mm3           \n\t"
1670
        "packuswb            %%mm1, %%mm0           \n\t"
1671
        "packuswb            %%mm3, %%mm2           \n\t"
1672
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
1673
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
1674
        "add                    $8, %%"REG_a"       \n\t"
1675
        " js                    1b                  \n\t"
1676
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1677
        : "%"REG_a
1678
    );
1679
#else
1680
    int i;
1681
    for (i=0; i<width; i++) {
1682
        dstU[i]= src1[2*i + 1];
1683
        dstV[i]= src2[2*i + 1];
1684
    }
1685
#endif
1686
}
1687

    
1688
/* This is almost identical to the previous, end exists only because
1689
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1690
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1691
{
1692
#if COMPILE_TEMPLATE_MMX
1693
    __asm__ volatile(
1694
        "mov                  %0, %%"REG_a"         \n\t"
1695
        "1:                                         \n\t"
1696
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1697
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1698
        "psrlw                $8, %%mm0             \n\t"
1699
        "psrlw                $8, %%mm1             \n\t"
1700
        "packuswb          %%mm1, %%mm0             \n\t"
1701
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1702
        "add                  $8, %%"REG_a"         \n\t"
1703
        " js                  1b                    \n\t"
1704
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1705
        : "%"REG_a
1706
    );
1707
#else
1708
    int i;
1709
    for (i=0; i<width; i++)
1710
        dst[i]= src[2*i+1];
1711
#endif
1712
}
1713

    
1714
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1715
{
1716
#if COMPILE_TEMPLATE_MMX
1717
    __asm__ volatile(
1718
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1719
        "mov                    %0, %%"REG_a"       \n\t"
1720
        "1:                                         \n\t"
1721
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1722
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1723
        "pand                %%mm4, %%mm0           \n\t"
1724
        "pand                %%mm4, %%mm1           \n\t"
1725
        "packuswb            %%mm1, %%mm0           \n\t"
1726
        "movq                %%mm0, %%mm1           \n\t"
1727
        "psrlw                  $8, %%mm0           \n\t"
1728
        "pand                %%mm4, %%mm1           \n\t"
1729
        "packuswb            %%mm0, %%mm0           \n\t"
1730
        "packuswb            %%mm1, %%mm1           \n\t"
1731
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
1732
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
1733
        "add                    $4, %%"REG_a"       \n\t"
1734
        " js                    1b                  \n\t"
1735
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1736
        : "%"REG_a
1737
    );
1738
#else
1739
    int i;
1740
    for (i=0; i<width; i++) {
1741
        dstU[i]= src1[4*i + 0];
1742
        dstV[i]= src1[4*i + 2];
1743
    }
1744
#endif
1745
    assert(src1 == src2);
1746
}
1747

    
1748
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1749
{
1750
#if COMPILE_TEMPLATE_MMX
1751
    __asm__ volatile(
1752
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1753
        "mov                    %0, %%"REG_a"       \n\t"
1754
        "1:                                         \n\t"
1755
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1756
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1757
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1758
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1759
        "pand                %%mm4, %%mm0           \n\t"
1760
        "pand                %%mm4, %%mm1           \n\t"
1761
        "pand                %%mm4, %%mm2           \n\t"
1762
        "pand                %%mm4, %%mm3           \n\t"
1763
        "packuswb            %%mm1, %%mm0           \n\t"
1764
        "packuswb            %%mm3, %%mm2           \n\t"
1765
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
1766
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
1767
        "add                    $8, %%"REG_a"       \n\t"
1768
        " js                    1b                  \n\t"
1769
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1770
        : "%"REG_a
1771
    );
1772
#else
1773
    int i;
1774
    for (i=0; i<width; i++) {
1775
        dstU[i]= src1[2*i];
1776
        dstV[i]= src2[2*i];
1777
    }
1778
#endif
1779
}
1780

    
1781
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1782
                                    const uint8_t *src, long width)
1783
{
1784
#if COMPILE_TEMPLATE_MMX
1785
    __asm__ volatile(
1786
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1787
        "mov                    %0, %%"REG_a"       \n\t"
1788
        "1:                                         \n\t"
1789
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1790
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1791
        "movq                %%mm0, %%mm2           \n\t"
1792
        "movq                %%mm1, %%mm3           \n\t"
1793
        "pand                %%mm4, %%mm0           \n\t"
1794
        "pand                %%mm4, %%mm1           \n\t"
1795
        "psrlw                  $8, %%mm2           \n\t"
1796
        "psrlw                  $8, %%mm3           \n\t"
1797
        "packuswb            %%mm1, %%mm0           \n\t"
1798
        "packuswb            %%mm3, %%mm2           \n\t"
1799
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
1800
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
1801
        "add                    $8, %%"REG_a"       \n\t"
1802
        " js                    1b                  \n\t"
1803
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1804
        : "%"REG_a
1805
    );
1806
#else
1807
    int i;
1808
    for (i = 0; i < width; i++) {
1809
        dst1[i] = src[2*i+0];
1810
        dst2[i] = src[2*i+1];
1811
    }
1812
#endif
1813
}
1814

    
1815
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1816
                                    const uint8_t *src1, const uint8_t *src2,
1817
                                    long width, uint32_t *unused)
1818
{
1819
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1820
}
1821

    
1822
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1823
                                    const uint8_t *src1, const uint8_t *src2,
1824
                                    long width, uint32_t *unused)
1825
{
1826
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1827
}
1828

    
1829
// FIXME Maybe dither instead.
/*
 * YUV_NBPS(depth): generates a pair of input helpers for 'depth'-bit
 * (9- and 10-bit below) planar YUV sources.  Each 16-bit sample is reduced
 * to 8 bits by a plain right shift of (depth-8), i.e. truncation without
 * rounding or dithering (hence the FIXME above).  The 'unused' parameter
 * only exists to match the common input-function signature.
 */
#define YUV_NBPS(depth) \
static inline void RENAME(yuv ## depth ## ToUV)(uint8_t *dstU, uint8_t *dstV, \
                                     const uint16_t *srcU, const uint16_t *srcV, \
                                     long width, uint32_t *unused) \
{ \
    int i; \
    for (i = 0; i < width; i++) { \
        dstU[i] = srcU[i]>>(depth-8); \
        dstV[i] = srcV[i]>>(depth-8); \
    } \
} \
\
static inline void RENAME(yuv ## depth ## ToY)(uint8_t *dstY, const uint16_t *srcY, long width, uint32_t *unused) \
{ \
    int i; \
    for (i = 0; i < width; i++) \
        dstY[i] = srcY[i]>>(depth-8); \
} \

YUV_NBPS( 9)
YUV_NBPS(10)
    
1852
#if COMPILE_TEMPLATE_MMX
1853
/**
 * MMX conversion of a line of packed 24-bit BGR (or RGB, selected by
 * srcFormat) to 8-bit luma.
 * The first asm block loads the per-format pmaddwd coefficient pairs into
 * %mm5/%mm6; the second block relies on those registers surviving between
 * the two asm statements (no intervening MMX/FPU code may clobber them).
 * Each loop iteration consumes 4 packed pixels (12 bytes) and stores 4 luma
 * bytes; NOTE(review): this assumes width is a multiple of 4 — confirm
 * against callers.
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t" /* REG_a counts from -width up to 0 */
        "pxor                    %%mm7, %%mm7       \n\t" /* mm7 = 0 for byte->word unpack */
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t" /* overlapping loads cover pixels 0..3 */
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t" /* advance 4 packed pixels */
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t" /* dot products with the coefficients */
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t" /* add luma offset/rounding */
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t" /* scale back from fixed point */
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t" /* saturate to 0..255 */
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    : "%"REG_a
    );
}
1905

    
1906
/**
 * MMX conversion of a line of packed 24-bit BGR/RGB to 8-bit U and V.
 * Coefficients are read from the ff_bgr24toUV table (%4), indexed by
 * srcFormat so the same code handles both byte orders.
 * Each iteration consumes 4 packed pixels (12 bytes) and stores 4 U and 4 V
 * bytes; NOTE(review): this assumes width is a multiple of 4 — confirm
 * against callers.
 */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq                   24(%4), %%mm6       \n\t" /* keep 4th coeff pair resident */
        "mov                        %3, %%"REG_a"   \n\t" /* REG_a counts from -width up to 0 */
        "pxor                    %%mm7, %%mm7       \n\t" /* mm7 = 0 for byte->word unpack */
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        /* first pixel pair: U accumulates in mm0, V in mm2 */
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                  (%4), %%mm0       \n\t"
        "pmaddwd                 8(%4), %%mm1       \n\t"
        "pmaddwd                16(%4), %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        /* second pixel pair: U in mm1, V in mm4 */
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t" /* advance 4 packed pixels */
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                  (%4), %%mm1       \n\t"
        "pmaddwd                 8(%4), %%mm3       \n\t"
        "pmaddwd                16(%4), %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        /* add chroma offset, scale down, saturate, store 4 U + 4 V bytes */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
    : "%"REG_a
    );
}
1963
#endif
1964

    
1965
/**
 * Convert a line of packed 24-bit BGR to 8-bit luma.
 * Dispatches to the MMX implementation when compiled in; the C fallback
 * computes Y = (RY*r + GY*g + BY*b + bias) >> RGB2YUV_SHIFT, where the
 * bias 33<<(RGB2YUV_SHIFT-1) equals 16.5 in fixed point: the +16 luma
 * offset plus 0.5 for rounding.  The unused parameter exists only to match
 * the common lumToYV12 signature.
 */
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++) {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* COMPILE_TEMPLATE_MMX */
}
1980

    
1981
/**
 * Convert a line of packed 24-bit BGR to 8-bit U and V (no subsampling).
 * The bias 257<<(RGB2YUV_SHIFT-1) equals 128.5 in fixed point: the chroma
 * offset plus 0.5 for rounding.  Only src1 is read; interleaved sources are
 * unsupported, which the assert documents.
 * NOTE(review): the assert runs after the conversion here, whereas
 * rgb24ToUV asserts before converting — harmless, but inconsistent.
 */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
#else
    int i;
    for (i=0; i<width; i++) {
        int b= src1[3*i + 0];
        int g= src1[3*i + 1];
        int r= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif /* COMPILE_TEMPLATE_MMX */
    assert(src1 == src2);
}
1998

    
1999
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2000
{
2001
    int i;
2002
    for (i=0; i<width; i++) {
2003
        int b= src1[6*i + 0] + src1[6*i + 3];
2004
        int g= src1[6*i + 1] + src1[6*i + 4];
2005
        int r= src1[6*i + 2] + src1[6*i + 5];
2006

    
2007
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2008
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2009
    }
2010
    assert(src1 == src2);
2011
}
2012

    
2013
/**
 * Convert a line of packed 24-bit RGB to 8-bit luma.
 * Identical to bgr24ToY except for the byte order of the source (r first);
 * the MMX helper is shared and selects RGB coefficients via PIX_FMT_RGB24.
 * The bias 33<<(RGB2YUV_SHIFT-1) is 16.5 in fixed point: the +16 luma
 * offset plus 0.5 for rounding.
 */
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
#else
    int i;
    for (i=0; i<width; i++) {
        int r= src[i*3+0];
        int g= src[i*3+1];
        int b= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif
}
2028

    
2029
/**
 * Convert a line of packed 24-bit RGB to 8-bit U and V (no subsampling).
 * Identical to bgr24ToUV except for the source byte order; the shared MMX
 * helper selects RGB coefficients via PIX_FMT_RGB24.  Only src1 is read;
 * the asserts document that interleaved sources are unsupported.
 */
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    assert(src1==src2);
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
#else
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++) {
        int r= src1[3*i + 0];
        int g= src1[3*i + 1];
        int b= src1[3*i + 2];

        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
    }
#endif
}
2047

    
2048
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2049
{
2050
    int i;
2051
    assert(src1==src2);
2052
    for (i=0; i<width; i++) {
2053
        int r= src1[6*i + 0] + src1[6*i + 3];
2054
        int g= src1[6*i + 1] + src1[6*i + 4];
2055
        int b= src1[6*i + 2] + src1[6*i + 5];
2056

    
2057
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2058
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2059
    }
2060
}
2061

    
2062

    
2063
// bilinear / bicubic scaling
2064
/**
 * Generic horizontal scaler (bilinear / bicubic / arbitrary FIR).
 * For each output pixel i, computes the dot product of filterSize
 * consecutive 8-bit source samples starting at filterPos[i] with 16-bit
 * filter coefficients, shifts right by 7 and stores a 15-bit result (see
 * the C reference loop at the bottom).  srcW and xInc are unused except by
 * the AltiVec path (kept for the common hScale signature).
 * The MMX paths have dedicated loops for the common filterSize 4 and 8
 * cases and a generic inner loop otherwise; each iteration produces two
 * output coefficients.  NOTE(review): that appears to assume dstW is even —
 * confirm against callers.
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
#if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        /* Bias the pointers so the loop counter can run from -2*dstW up to 0
         * and double as the store index. */
        x86_reg counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push            %%"REG_b"              \n\t" /* ebx is the PIC register; preserve it */
#endif
            "pxor                %%mm7, %%mm7       \n\t"
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
            ".p2align                4              \n\t"
            "1:                                     \n\t"
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t" /* filterPos[i], filterPos[i+1] */
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t" /* 4 coeffs for each output */
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
            "movd      (%3, %%"REG_a"), %%mm0       \n\t" /* 4 source bytes each */
            "movd      (%3, %%"REG_b"), %%mm2       \n\t"
            "punpcklbw           %%mm7, %%mm0       \n\t"
            "punpcklbw           %%mm7, %%mm2       \n\t"
            "pmaddwd             %%mm1, %%mm0       \n\t"
            "pmaddwd             %%mm2, %%mm3       \n\t"
            "movq                %%mm0, %%mm4       \n\t"
            "punpckldq           %%mm3, %%mm0       \n\t" /* horizontal add of the two dwords */
            "punpckhdq           %%mm3, %%mm4       \n\t"
            "paddd               %%mm4, %%mm0       \n\t"
            "psrad                  $7, %%mm0       \n\t"
            "packssdw            %%mm0, %%mm0       \n\t" /* saturating pack = the FFMIN clip */
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
            "add                    $4, %%"REG_BP"  \n\t"
            " jnc                   1b              \n\t"

            "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
            "pop             %%"REG_b"              \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push             %%"REG_b"             \n\t" /* ebx is the PIC register; preserve it */
#endif
            "pxor                 %%mm7, %%mm7      \n\t"
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
            "mov              %%"REG_a", %%"REG_BP" \n\t"
            ".p2align                 4             \n\t"
            "1:                                     \n\t"
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t" /* filterPos[i], filterPos[i+1] */
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t" /* first 4 of 8 coeffs */
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
            "movd       (%3, %%"REG_a"), %%mm0      \n\t"
            "movd       (%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm0      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm0      \n\t"
            "pmaddwd              %%mm2, %%mm3      \n\t"

            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t" /* remaining 4 coeffs */
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
            "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
            "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm4      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm4      \n\t"
            "pmaddwd              %%mm2, %%mm5      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "paddd                %%mm5, %%mm3      \n\t"
            "movq                 %%mm0, %%mm4      \n\t"
            "punpckldq            %%mm3, %%mm0      \n\t" /* horizontal add */
            "punpckhdq            %%mm3, %%mm4      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "psrad                   $7, %%mm0      \n\t"
            "packssdw             %%mm0, %%mm0      \n\t" /* saturating pack = clip */
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
            "add                     $4, %%"REG_BP" \n\t"
            " jnc                    1b             \n\t"

            "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
            "pop              %%"REG_b"             \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        /* Generic filter size: inner loop (label 2) walks the taps in steps
         * of 4 until the source cursor reaches src+filterSize (%4). */
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor                  %%mm7, %%mm7     \n\t"
            ".p2align                  4            \n\t"
            "1:                                     \n\t"
            "mov                      %2, %%"REG_c" \n\t"
            "movzwl      (%%"REG_c", %0), %%eax     \n\t" /* filterPos[i], filterPos[i+1] */
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
            "mov                      %5, %%"REG_c" \n\t"
            "pxor                  %%mm4, %%mm4     \n\t" /* accumulators for the two outputs */
            "pxor                  %%mm5, %%mm5     \n\t"
            "2:                                     \n\t"
            "movq                   (%1), %%mm1     \n\t"
            "movq               (%1, %6), %%mm3     \n\t" /* coeffs of the 2nd output, filterSize*2 bytes on */
            "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
            "punpcklbw             %%mm7, %%mm0     \n\t"
            "punpcklbw             %%mm7, %%mm2     \n\t"
            "pmaddwd               %%mm1, %%mm0     \n\t"
            "pmaddwd               %%mm2, %%mm3     \n\t"
            "paddd                 %%mm3, %%mm5     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "add                      $8, %1        \n\t"
            "add                      $4, %%"REG_c" \n\t"
            "cmp                      %4, %%"REG_c" \n\t"
            " jb                      2b            \n\t"
            "add                      %6, %1        \n\t" /* skip the 2nd output's coeff row */
            "movq                  %%mm4, %%mm0     \n\t"
            "punpckldq             %%mm5, %%mm4     \n\t" /* horizontal add */
            "punpckhdq             %%mm5, %%mm0     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "psrad                    $7, %%mm4     \n\t"
            "packssdw              %%mm4, %%mm4     \n\t" /* saturating pack = clip */
            "mov                      %3, %%"REG_a" \n\t"
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
            "add                      $4, %0        \n\t"
            " jnc                     1b            \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
            "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* Plain C reference implementation. */
    int i;
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++) {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* COMPILE_TEMPLATE_ALTIVEC */
#endif /* COMPILE_MMX */
}
2238

    
2239
//FIXME all pal and rgb srcFormats could do this convertion as well
2240
//FIXME all scalers more complex than bilinear could do half of this transform
2241
static void RENAME(chrRangeToJpeg)(int16_t *dst, int width)
2242
{
2243
    int i;
2244
    for (i = 0; i < width; i++) {
2245
        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2246
        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2247
    }
2248
}
2249
static void RENAME(chrRangeFromJpeg)(int16_t *dst, int width)
2250
{
2251
    int i;
2252
    for (i = 0; i < width; i++) {
2253
        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
2254
        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2255
    }
2256
}
2257
static void RENAME(lumRangeToJpeg)(int16_t *dst, int width)
2258
{
2259
    int i;
2260
    for (i = 0; i < width; i++)
2261
        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2262
}
2263
static void RENAME(lumRangeFromJpeg)(int16_t *dst, int width)
2264
{
2265
    int i;
2266
    for (i = 0; i < width; i++)
2267
        dst[i] = (dst[i]*14071 + 33561947)>>14;
2268
}
2269

    
2270
/* Asm fragment shared by hyscale_fast/hcscale_fast: blends %edi (src[xx])
 * and %esi (src[xx+1]) with the 16.16 fractional weight in %ecx, leaving
 * the 7-bit fixed-point result in %esi and reloading the dst pointer into
 * REG_D from operand %1.  Every line must keep its trailing backslash. */
#define FAST_BILINEAR_X86 \
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
    "shll      $16, %%edi    \n\t"                                              \
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
    "mov        %1, %%"REG_D"\n\t"                                              \
    "shrl       $9, %%esi    \n\t"                                              \
    
2278
/**
 * Fast bilinear horizontal scale of one luma line into 7-bit fixed-point
 * output: dst[i] = src[xx]<<7 + (src[xx+1]-src[xx])*xalpha, with the source
 * position advanced by the 16.16 increment xInc (see the C fallback at the
 * bottom).
 * On x86 with MMX2 it may instead jump into runtime-generated filter code
 * (c->lumMmx2FilterCode); a scalar-asm bilinear loop is used otherwise.
 * Note: CALL_MMX2_FILTER_CODE is #defined here inside the function and is
 * deliberately left defined — RENAME(hcscale_fast) below reuses it.
 */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    /* manual save slot for ebx, which is the PIC register */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

#define CALL_MMX2_FILTER_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

            /* the generated code is invoked in 8 chunks */
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* pad the tail where the source would be read past srcW-1 */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    x86_reg dstWidth_reg = dstWidth;
    //NO MMX just normal asm ...
    /* two output pixels per loop iteration; carry from the 16-bit fraction
     * add propagates into the integer position via adc */
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ".p2align    4                       \n\t"
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* portable C fallback */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2400

    
2401
      // *** horizontal scale Y line to temp buffer
2402
/**
 * Horizontally scale one luma (or alpha, when isAlpha is set) line into dst.
 * The input is first converted to planar 8-bit via the per-format toYV12
 * callback when one is installed, then scaled either with the generic
 * hScale or the fast bilinear path, and finally range-converted (luma only;
 * alpha has no range conversion).
 */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                   const int16_t *hLumFilter,
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal, int isAlpha)
{
    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *);
    void (*convertRange)(uint16_t *, int);

    if (isAlpha) {
        toYV12       = c->alpToYV12;
        convertRange = NULL;            /* no range conversion for alpha */
        src         += c->alpSrcOffset;
    } else {
        toYV12       = c->lumToYV12;
        convertRange = c->lumConvertRange;
        src         += c->lumSrcOffset;
    }

    if (toYV12) {
        /* convert to planar 8-bit into the scratch buffer first */
        toYV12(formatConvBuffer, src, srcW, pal);
        src = formatConvBuffer;
    }

    if (c->hyscale_fast) {
        /* fast bilinear upscale / crap downscale */
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
    } else {
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }

    if (convertRange)
        convertRange(dst, dstWidth);
}
2427

    
2428
/**
 * Fast bilinear horizontal scale of one pair of chroma lines: U results go
 * to dst[0..dstWidth-1], V results to dst[VOFW..] (see the C fallback at
 * the bottom for the exact arithmetic).
 * On x86 with MMX2 it may jump into runtime-generated filter code
 * (c->chrMmx2FilterCode), invoking it 4 times per plane; relies on
 * CALL_MMX2_FILTER_CODE still being #defined from RENAME(hyscale_fast)
 * above.
 */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    /* manual save slot for ebx, which is the PIC register */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            /* first plane (src1 -> dst) */
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            /* second plane (src2 -> dst+VOFW) */
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* pad the tail where the source would be read past srcW-1 */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            //printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        x86_reg dstWidth_reg = dstWidth;
        /* one U and one V output per loop iteration; the carry of the
         * 16-bit fraction add propagates into the position via adc */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
            ".p2align    4                          \n\t"
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
            :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* portable C fallback */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2549

    
2550
/**
 * Horizontally scale one pair of chroma source lines into the
 * intermediate 16-bit buffer: U goes to dst[0..], V to dst[VOFW..].
 *
 * The source lines are optionally converted to planar chroma first
 * (via c->chrToYV12 into formatConvBuffer), then scaled either by the
 * fast bilinear routine or the generic filter-based hScale, and finally
 * range-converted (JPEG <-> MPEG levels) if c->chrConvertRange is set.
 */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{
    src1 += c->chrSrcOffset;
    src2 += c->chrSrcOffset;

    /* Input not already planar chroma: convert both lines into the
       scratch buffer and scale from there instead. */
    if (c->chrToYV12) {
        uint8_t *uPlane = formatConvBuffer;
        uint8_t *vPlane = formatConvBuffer + VOFW;
        c->chrToYV12(uPlane, vPlane, src1, src2, srcW, pal);
        src1 = uPlane;
        src2 = vPlane;
    }

    if (c->hcscale_fast) {
        /* fast bilinear upscale / crap downscale */
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
    } else {
        /* generic filter-based horizontal scaling, one plane at a time */
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
}
2576

    
2577
/* Ring-buffer debug tracing for swScale(); set DEBUG_SWSCALE_BUFFERS to 1
   to log buffer-index/slice bookkeeping via av_log at AV_LOG_DEBUG level.
   With the default 0 the calls compile away to nothing. */
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2579

    
2580
/**
 * Main template scaling entry point: scales one horizontal slice of the
 * source image (srcSliceH lines starting at srcSliceY) into dst.
 *
 * Horizontally scaled lines are kept in ring buffers (lumPixBuf/chrPixBuf/
 * alpPixBuf) so that the vertical filter can combine lines across slice
 * boundaries; the ring-buffer state is carried between calls in the context.
 *
 * @return number of destination lines written (dstY - lastDstY).
 */
static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); // rounds up
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* Packed input carries all components in plane 0: alias the other
       plane pointers/strides to it so the code below can treat it uniformly. */
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        /* first slice of a frame: reset ring-buffer bookkeeping */
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    /* Produce destination lines while this slice supplies enough input lines. */
    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        if (!enough_lines) {
            /* not enough input: buffer what this slice does provide */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                                            lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
                            formatConvBuffer,
                            pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
                                formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
        }
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
                                formatConvBuffer,
                                pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if COMPILE_TEMPLATE_MMX
        /* per-line dither tables for the MMX RGB output paths */
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2) {
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            /* Pack line pointers and coefficients into the layout expected
               by the MMX vertical-filter asm (APCK_* layout for the
               accurate-rounding variant, 4-int32 stride otherwise). */
            int i;
            if (flags & SWS_ACCURATE_RND) {
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                for (i=0; i<vLumFilterSize; i++) {
                    /* split the 64-bit line pointer across two int32 slots */
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            /* Vertical scaling + output, dispatched by destination format. */
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            /* last two output lines: use the plain C output paths */
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else {
                    yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* YUVA output without an alpha ring buffer: fill alpha plane opaque */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
    else                             __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
2945

    
2946
/**
 * Initialize the template-specific function pointers in the context:
 * vertical output functions, horizontal scalers, per-source-format
 * input converters (lum/chr/alp -> YV12-style planes), per-format
 * component offsets, and optional range-conversion functions.
 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    /* vertical scaling / output functions from this template instance */
    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );

    c->hScale       = RENAME(hScale      );

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
#else
    if (c->flags & SWS_FAST_BILINEAR)
#endif
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }

    /* chroma input converter; NULL means input chroma is already planar */
    c->chrToYV12 = NULL;
    switch(srcFormat) {
        case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
        case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
        case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
        case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
        case PIX_FMT_RGB8     :
        case PIX_FMT_BGR8     :
        case PIX_FMT_PAL8     :
        case PIX_FMT_BGR4_BYTE:
        case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
        case PIX_FMT_YUV420P9 : c->chrToYV12 = (void*)RENAME(yuv9ToUV ); break;
        case PIX_FMT_YUV420P10: c->chrToYV12 = (void*)RENAME(yuv10ToUV); break;
        case PIX_FMT_YUV420P16BE:
        case PIX_FMT_YUV422P16BE:
        case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
        case PIX_FMT_YUV420P16LE:
        case PIX_FMT_YUV422P16LE:
        case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
    }
    /* RGB sources: pick the _half variants when chroma is horizontally
       subsampled (they average pairs of source pixels) */
    if (c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
        case PIX_FMT_BGR48BE:
        case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV_half; break;
        case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV_half;  break;
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV_half; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV_half;  break;
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV_half; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
        case PIX_FMT_BGR48BE:
        case PIX_FMT_BGR48LE: c->chrToYV12 = bgr48ToUV; break;
        case PIX_FMT_RGB32  : c->chrToYV12 = bgr32ToUV;  break;
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr321ToUV; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
        case PIX_FMT_BGR32  : c->chrToYV12 = rgb32ToUV;  break;
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb321ToUV; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
        }
    }

    /* luma and alpha input converters */
    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUV420P9 : c->lumToYV12 = (void*)RENAME(yuv9ToY ); break;
    case PIX_FMT_YUV420P10: c->lumToYV12 = (void*)RENAME(yuv10ToY); break;
    case PIX_FMT_YUYV422  :
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY8A   :
    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
    case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY;  break;
    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY; break;
    case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY;  break;
    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY; break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
    case PIX_FMT_BGR48BE:
    case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48ToY; break;
    }
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
        case PIX_FMT_GRAY8A : c->alpToYV12 = RENAME(yuy2ToY); break;
        case PIX_FMT_PAL8   : c->alpToYV12 = palToA; break;
        }
    }

    /* byte offsets of the components within a source pixel, where the
       generic converters need them shifted */
    switch (srcFormat) {
    case PIX_FMT_GRAY8A :
        c->alpSrcOffset = 1;
        break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB48LE:
    case PIX_FMT_BGR48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    }

    /* JPEG (full) <-> MPEG (limited) range conversion for YUV output */
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }

    /* gray / monochrome conversions carry no chroma, so horizontal
       chroma scaling can be skipped entirely */
    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
}