Statistics
| Branch: | Revision:

ffmpeg / libswscale / swscale_template.c @ 40fa5140

History | View | Annotate | Download (137 KB)

/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

    
24
/*
 * Drop any earlier definitions of the instruction-selection macros so they
 * can be redefined below.
 * NOTE(review): presumably required because this template is included more
 * than once with different HAVE_* settings -- confirm in the including file.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
29

    
30
/*
 * Prefetch mnemonics spliced into inline-asm strings.
 * 3DNow! is checked first, then MMX2; with neither available both macros
 * expand to " # nop", i.e. text starting with '#', which the GNU assembler
 * treats as a comment on x86, so no instruction is emitted.
 */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif HAVE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
40

    
41
/*
 * PAVGB(a,b): emit a packed byte-average instruction as asm text.
 * MMX2 uses "pavgb", 3DNow! uses "pavgusb".  There is deliberately no
 * fallback: when neither extension is enabled PAVGB stays undefined, so any
 * user of it must itself be guarded by the same feature tests.
 */
#if HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
46

    
47
/*
 * MOVNTQ(a,b): emit a 64-bit store as asm text.
 * With MMX2 this is the non-temporal "movntq"; otherwise a plain "movq".
 *
 * REAL_MOVNTQ stringifies its arguments with '#', which suppresses macro
 * expansion of those arguments.  MOVNTQ adds one level of indirection so
 * that macro arguments (e.g. a register-name macro) are expanded first.
 */
#if HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
53

    
54
#if HAVE_ALTIVEC
55
#include "ppc/swscale_altivec_template.c"
56
#endif
57

    
58
/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Complete asm statement: vertical filter producing 8 output bytes per
 * outer iteration.
 *
 *   x      - string literal: displacement added to every source address
 *   offset - string literal: displacement from %0 to the filter list
 *   dest   - output pointer (operand %1)
 *   width  - loop bound compared against the running index (operand %2)
 *
 * Operand %0 is &c->redDither; VROUNDER_OFFSET and `offset` are taken
 * relative to it.
 *
 * Filter list layout as read here: 16-byte entries, source pointer at +0,
 * coefficient qword at +8; a NULL source pointer ends the list.
 * Source data is indexed with scale 2 (2 bytes per sample).
 *
 * Label 1 serves both loops: the inner loop re-enters via "jnz 1b" while
 * list entries remain; the outer loop resets the accumulators and list
 * pointer and re-enters via "jb 1b".
 *
 * The loop body runs before the bound is tested, so at least one 8-byte
 * group is always written.
 * NOTE(review): the asm stores through dest but lists no "memory" clobber
 * and no output operand -- confirm this is safe at every use site.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t" /* flags survive the MMX ops below */\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t"\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t" /* movq/lea/mov below leave flags intact */\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
93

    
94
/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Same interface as YSCALEYUV2YV12X (x/offset are string literals, %0 is
 * &c->redDither, %1 is dest, %2 is width), but products are accumulated in
 * 32 bits.
 *
 * Each filter-list entry is read as: source pointer at +0, a second source
 * pointer at +APCK_PTR2, and a coefficient qword at +APCK_COEF.  Entries
 * are APCK_SIZE bytes apart, and a NULL first pointer ends the list.
 * Samples from the two sources are interleaved (punpck{l,h}wd) and
 * multiplied/added pairwise with pmaddwd.
 * NOTE(review): this implies the coefficient qword holds interleaved
 * (coef-for-ptr1, coef-for-ptr2) words -- confirm where the list is built.
 *
 * Accumulators: mm4/mm5 = low/high dwords of the first 4 samples,
 *               mm6/mm7 = low/high dwords of the next 4 samples.
 * After the list is exhausted the sums are shifted right by 16, packed to
 * words, the rounder at VROUNDER_OFFSET is added, then >>3 and packed to
 * 8 unsigned bytes.
 *
 * Label 1 is shared by the inner (jnz) and outer (jb) loops.
 * NOTE(review): no "memory" clobber although the asm stores through dest.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t" /* switch to the entry's second source */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t" /* first pointer of the next entry */\
    "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t" /* flags survive the MMX ops below */\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t"\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t" /* lea/pxor/mov below leave flags intact */\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
155

    
156
/*
 * YSCALEYUV2YV121
 *
 * Asm text only (no __asm__ wrapper, no operand list): converts 16-bit
 * samples at (%0 + 2*index) to unsigned bytes at (%1 + index) by an
 * arithmetic >>7 followed by an unsigned-saturating pack, 8 at a time.
 *
 * The index starts at %2 and the loop repeats until "add $8" produces a
 * carry.
 * NOTE(review): this presumably expects %2 to be a negative count with
 * %0/%1 pointing past the end of the data -- confirm at the use site.
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
168

    
169
/*
 * YSCALEYUV2YV121_ACCURATE
 *
 * Rounding variant of YSCALEYUV2YV121: adds 64 to every 16-bit sample with
 * signed saturation before the arithmetic >>7, then packs to unsigned
 * bytes.
 *
 * The constant is built in mm7 without a memory load:
 *   pcmpeqw -> 0xFFFF per word, psrlw 15 -> 1, psllw 6 -> 64.
 *
 * Asm text only; operands %0 (source), %1 (dest) and %2 (start index) are
 * supplied by the enclosing asm statement.  The loop runs until "add $8"
 * carries.
 * NOTE(review): presumably %2 is a negative count -- confirm at use site.
 */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
186

    
187
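/*
    NOTE(review): commented-out operand/clobber list kept for reference. It
    does not match the operands declared in YSCALEYUV2PACKEDX_END and appears
    to be stale -- confirm before relying on it.

    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/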
194
/*
 * YSCALEYUV2PACKEDX_UV
 *
 * Opens an asm statement ("__asm__ volatile(") and emits the chroma half of
 * the vertical filter.  The statement is intentionally left open: further
 * asm text follows and YSCALEYUV2PACKEDX_END supplies the operands and the
 * closing ");".
 *
 * Label 1 marks the start of one output group; this macro never jumps back
 * to it, so the backward branch must come from code appended later.
 * Label 2 is the loop over the chroma filter list at
 * CHR_MMX_FILTER_OFFSET(%0): 16-byte entries, source pointer at +0,
 * coefficient qword at +8, NULL pointer terminates.
 *
 * On exit: mm3 = filtered U, mm4 = filtered V (4 words each), both seeded
 * with the rounder at VROUNDER_OFFSET(%0).  V samples are read VOF bytes
 * after the U samples.  The index register is used unscaled here.
 *
 * The final line carries no line continuation, so the macro cannot absorb
 * whatever line happens to follow it.
 */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"
217

    
218
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 *
 * Asm text for one vertical filter pass over 2-byte samples (index scaled
 * by 2), producing two groups of 4 words.
 *
 *   offset        - string literal: displacement from %0 to the filter list
 *   coeff         - MMX register receiving each coefficient qword
 *   src1, src2    - scratch MMX registers for the two sample qwords
 *   dst1, dst2    - MMX accumulators, seeded with VROUNDER_OFFSET(%0)
 *
 * Register arguments are stringified, so they are written as they must
 * appear in the asm template (e.g. %%mm0).
 *
 * The filter list uses 16-byte entries: source pointer at +0, coefficient
 * at +8, NULL pointer terminates.  Label 2 is a numeric local label, so
 * reusing it after another "2:" earlier in the same statement is fine:
 * "2b" binds to the nearest preceding definition.
 *
 * The final line carries no line continuation, so the macro cannot absorb
 * whatever line happens to follow it.
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"
236

    
237
/*
 * YSCALEYUV2PACKEDX
 *
 * Chroma pass followed by the luma pass over LUM_MMX_FILTER_OFFSET.
 * Register use in the luma pass: mm0 = coefficient, mm2/mm5 = scratch,
 * mm1/mm7 = the two luma accumulators.
 * The expansion leaves the asm statement opened by YSCALEYUV2PACKEDX_UV
 * unterminated; it is closed by YSCALEYUV2PACKEDX_END.
 */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7)
240

    
241
/*
 * YSCALEYUV2PACKEDX_END
 *
 * Operand list, clobber list and closing ");" for an asm statement opened
 * by one of the YSCALEYUV2PACKEDX*_UV macros.
 *
 * Input operands, in order:
 *   %0      = &c->redDither (base for all *_OFFSET displacements)
 *   %1..%3  = dummy placeholders (only occupy operand numbers)
 *   %4      = dest
 *   %5      = dstW
 * Clobbers: the a, d and S general-purpose registers.
 *
 * Expects `c`, `dummy`, `dest` and `dstW` to be in scope at the use site.
 */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
247

    
248
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV
 *
 * Opens an asm statement and emits the chroma half of the vertical filter
 * with 32-bit accumulation.  The statement stays open; operands and the
 * closing ");" come from YSCALEYUV2PACKEDX_END.
 *
 * Filter entries at CHR_MMX_FILTER_OFFSET(%0) are read as: source pointer
 * at +0, second source pointer at +APCK_PTR2, coefficient qword at
 * +APCK_COEF; entries are APCK_SIZE bytes apart and a NULL first pointer
 * ends the list.  Samples from both sources are interleaved and combined
 * with pmaddwd.
 *
 * Accumulators: mm4/mm5 = U low/high dwords, mm6/mm7 = V low/high dwords.
 * After the loop: >>16, pack to words, add the rounder from
 * VROUNDER_OFFSET(%0), then store U to U_TEMP(%0) and V to V_TEMP(%0).
 * The results are spilled because the luma pass reuses the same MMX
 * registers.
 *
 * Label 1 marks the start of an output group (the branch back to it is
 * emitted elsewhere); label 2 is the filter-list loop.
 *
 * The final line carries no line continuation, so the macro cannot absorb
 * whatever line happens to follow it.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t" /* switch to the entry's second source */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t" /* first pointer of the next entry */\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t" /* flags survive the MMX ops below */\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"
297

    
298
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 *
 * Asm text for the 32-bit-accumulating vertical filter over 2-byte samples
 * (index scaled by 2).  `offset` is a string-literal displacement from %0
 * to the filter list, which uses the APCK_PTR2 / APCK_COEF / APCK_SIZE
 * entry layout and ends at a NULL first pointer.
 *
 * Accumulators: mm1/mm5 = first 4 samples (low/high dwords),
 *               mm7/mm6 = next 4 samples (low/high dwords).
 * After the loop: >>16, pack to words, add the rounder from
 * VROUNDER_OFFSET(%0).
 *
 * On exit: mm1 = first result group, mm7 = second result group, and
 * mm3/mm4 are reloaded from U_TEMP(%0)/V_TEMP(%0).
 *
 * The final line carries no line continuation, so the macro cannot absorb
 * whatever line happens to follow it.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t" /* switch to the entry's second source */\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t" /* first pointer of the next entry */\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t" /* flags survive the MMX ops below */\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"
342

    
343
/*
 * YSCALEYUV2PACKEDX_ACCURATE
 *
 * 32-bit-accumulating chroma pass followed by the matching luma pass over
 * LUM_MMX_FILTER_OFFSET.  The asm statement opened by the chroma macro is
 * left unterminated; YSCALEYUV2PACKEDX_END closes it.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
346

    
347
/*
 * YSCALEYUV2RGBX
 *
 * Asm text converting filtered YUV words to packed 8-bit B, G and R.
 * All coefficients and offsets are read relative to operand %0.
 *
 * On entry: mm1 = first 4 luma words, mm7 = next 4 luma words,
 *           mm3 = U words, mm4 = V words.
 * On exit:  mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes
 *           (unsigned-saturated by packuswb).
 *
 * Each chroma word is duplicated (punpck{l,h}wd of a register with itself)
 * so that one chroma sample is applied to two neighbouring luma samples.
 *
 * The final line carries no line continuation, so the macro cannot absorb
 * whatever line happens to follow it.
 */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t" /* mm4 = ug + vg */\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = paired with Y1, 2 = paired with Y2) */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"
382

    
383
/*
 * REAL_YSCALEYUV2PACKED(index, c) / YSCALEYUV2PACKED(index, c)
 *
 * Asm text blending two input lines (luma: %0/%1, chroma: %2/%3) for
 * packed-YUV output.
 *
 *   index - register used as the running index (stringified)
 *   c     - base register of the context (stringified)
 *
 * Prologue side effect: the coefficient qwords stored at
 * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c) are shifted
 * right by 3 and written back to memory.
 * NOTE(review): the context is therefore modified in place, and expanding
 * this twice without re-initialising those fields would shift again --
 * confirm the callers account for that.
 *
 * Per iteration, for both chroma and luma:
 *   result = ((line0 - line1) * coeff >> 16) + (line1 >> 7)
 * On exit from the body: mm3 = U, mm4 = V, mm1 = first 4 luma words,
 * mm7 = next 4 luma words.  V samples are read VOF bytes after U.
 *
 * Label 1 opens the loop; the backward branch is emitted by whatever asm
 * text follows this macro.
 *
 * YSCALEYUV2PACKED is the entry point: the extra macro level lets the
 * arguments be macro-expanded before REAL_* stringifies them.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+VOF]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+VOF] - uvbuf1[eax+VOF]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+VOF] - uvbuf1[eax+VOF])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+VOF] >>7*/\
    "paddw             %%mm2, %%mm3     \n\t" /* U: scaled difference + uvbuf1 term*/\
    "paddw             %%mm5, %%mm4     \n\t" /* V: scaled difference + uvbuf1 term*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
    "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
    "paddw             %%mm0, %%mm1     \n\t" /* Y1: scaled difference + buf1 term*/\
    "paddw             %%mm6, %%mm7     \n\t" /* Y2: scaled difference + buf1 term*/

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
420

    
421
/*
 * REAL_YSCALEYUV2RGB_UV(index, c)
 *
 * Asm text for the chroma half of the two-line blend used before RGB
 * conversion.
 *
 *   index - register used as the running index (stringified; zeroed here)
 *   c     - base register of the context (stringified)
 *
 * Chroma lines are read from operands %2 and %3; V samples sit VOF bytes
 * after the U samples.  Blend per component:
 *   ((line0 - line1) * coeff >> 16) + (line1 >> 4)
 * with coeff taken from CHR_MMX_FILTER_OFFSET+8(c).  U_OFFSET/V_OFFSET are
 * then subtracted and the green contributions computed.
 *
 * On exit: mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg.
 * Label 1 opens the loop; the backward branch is emitted by later text.
 *
 * The final line carries no line continuation, so the macro cannot absorb
 * whatever line happens to follow it.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+VOF]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+VOF] - uvbuf1[eax+VOF]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+VOF] - uvbuf1[eax+VOF])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+VOF] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* U: scaled difference + uvbuf1 term*/\
    "paddw             %%mm5, %%mm4     \n\t" /* V: scaled difference + uvbuf1 term*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */
445

    
446
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 *
 * Asm text blending 8 samples of 2-byte data from two lines.
 *
 *   index  - running index register (stringified, scaled by 2)
 *   c      - base register of the context (stringified)
 *   b1, b2 - asm operands/registers holding the two line pointers
 *
 * Blend: ((b1 - b2) * coeff >> 16) + (b2 >> 4), with coeff read from
 * LUM_MMX_FILTER_OFFSET+8(c).
 * On exit: mm1 = first 4 result words, mm7 = next 4 result words;
 * mm0 and mm6 are used as scratch.
 *
 * The final line carries no line continuation, so the macro cannot absorb
 * whatever line happens to follow it.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* scaled difference + buf1 term*/\
    "paddw             %%mm6, %%mm7     \n\t" /* scaled difference + buf1 term*/
459

    
460
/*
 * REAL_YSCALEYUV2RGB_COEFF(c)
 *
 * Asm text applying the colour-matrix coefficients stored relative to the
 * context base register `c` (stringified) and packing to 8-bit B, G, R.
 *
 * On entry: mm1/mm7 = first/next 4 luma words, mm2 = U - offset,
 *           mm5 = V - offset, mm3 = ug, mm4 = vg.
 * On exit:  mm2 = 8 blue bytes, mm4 = 8 green bytes, mm5 = 8 red bytes
 *           (unsigned-saturated by packuswb).
 *
 * Each chroma word is duplicated (punpck{l,h}wd of a register with itself)
 * so that one chroma sample is applied to two neighbouring luma samples.
 *
 * The final line carries no line continuation, so the macro cannot absorb
 * whatever line happens to follow it.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t" /* mm4 = ug + vg */\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = paired with Y1, 2 = paired with Y2) */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"
488

    
489
/* Forwarding wrapper: the extra macro level lets the arguments be
 * macro-expanded before REAL_YSCALEYUV2RGB_YA stringifies them with '#'. */
#define YSCALEYUV2RGB_YA(idx, ctx, line0, line1) \
    REAL_YSCALEYUV2RGB_YA(idx, ctx, line0, line1)
490

    
491
/*
 * YSCALEYUV2RGB(index, c)
 *
 * Full two-line blend plus YUV->RGB conversion as asm text:
 *   1. chroma blend and green terms        (REAL_YSCALEYUV2RGB_UV)
 *   2. luma blend of the lines in %0 / %1  (REAL_YSCALEYUV2RGB_YA)
 *   3. colour matrix and byte packing      (REAL_YSCALEYUV2RGB_COEFF)
 *
 * index is the running index register, c the context base register.
 */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
495

    
496
/*
 * REAL_YSCALEYUV2PACKED1(index, c) / YSCALEYUV2PACKED1(index, c)
 *
 * Single-line variant for packed-YUV output: no blending, every sample is
 * simply shifted right arithmetically by 7.
 *
 *   index - running index register (stringified; zeroed here)
 *   c     - accepted for symmetry with the other variants; unused
 *
 * Reads chroma from operand %2 (V is VOF bytes after U) and luma from
 * operand %0.  On exit from the body: mm3 = U, mm4 = V, mm1 = first 4 luma
 * words, mm7 = next 4 luma words.
 * Label 1 opens the loop; the backward branch is emitted by later text.
 *
 * YSCALEYUV2PACKED1 is the entry point: the extra macro level lets the
 * arguments be macro-expanded before REAL_* stringifies them.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+VOF]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
510

    
511
/*
 * REAL_YSCALEYUV2RGB1(index, c) / YSCALEYUV2RGB1(index, c)
 *
 * Single-line YUV->RGB conversion as asm text: chroma is read from operand
 * %2 only and luma from operand %0 only, each shifted right arithmetically
 * by 4 before the colour matrix is applied.
 *
 *   index - running index register (stringified; zeroed here)
 *   c     - base register of the context holding offsets and coefficients
 *
 * On exit from the body: mm2 = 8 blue bytes, mm4 = 8 green bytes,
 * mm5 = 8 red bytes (unsigned-saturated by packuswb).
 * Label 1 opens the loop; the backward branch is emitted by later text.
 *
 * YSCALEYUV2RGB1 is the entry point: the extra macro level lets the
 * arguments be macro-expanded before REAL_* stringifies them.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+VOF]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+VOF] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t" /* mm4 = ug + vg */\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 (1 = paired with Y1, 2 = paired with Y2) */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
559

    
560
/*
 * REAL_YSCALEYUV2PACKED1b(index, c) / YSCALEYUV2PACKED1b(index, c)
 *
 * Packed-YUV variant that uses one luma line (operand %0, arithmetic >>7)
 * but combines two chroma lines (operands %2 and %3): the two chroma
 * samples are added and the sum is shifted right logically by 8.
 *
 *   index - running index register (stringified; zeroed here)
 *   c     - accepted for symmetry with the other variants; unused
 *
 * On exit from the body: mm3 = U, mm4 = V, mm1 = first 4 luma words,
 * mm7 = next 4 luma words.  V samples are read VOF bytes after U.
 * Label 1 opens the loop; the backward branch is emitted by later text.
 *
 * YSCALEYUV2PACKED1b is the entry point: the extra macro level lets the
 * arguments be macro-expanded before REAL_* stringifies them.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+VOF]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+VOF] + uvbuf1[eax+VOF]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
577

    
578
/*
 * YUV -> RGB loop head for a single luma line with vertical chrominance
 * interpolation (the two chroma lines at %2 and %3 are summed).
 *
 * Emits the "1:" loop label, loads 8 luma samples from %0 and the matching
 * chroma from %2/%3 (V plane at displacement VOF), applies the offsets and
 * coefficients stored in the context block addressed by `c`, and finishes
 * with packed unsigned bytes in
 *   mm2 = B, mm4 = G, mm5 = R   (8 pixels each).
 * mm0, mm1, mm3, mm6 and mm7 are used as scratch.
 * The macro zeroes `index` itself and contains no loop-closing jump; it is
 * meant to be followed by a writer macro that advances `index` and ends with
 * "jb 1b".
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[index+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[index+VOF]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[index] + uvbuf1[index]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[index+VOF] + uvbuf1[index+VOF]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0[index  ] */\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0[index+4] */\
    "psraw                $4, %%mm1     \n\t" /* buf0[index  ] >>4 */\
    "psraw                $4, %%mm7     \n\t" /* buf0[index+4] >>4 */\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

/* Forwarding layer so arguments are macro-expanded before '#' is applied. */
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
631

    
632
/*
 * Load 8 alpha samples for the single-line packed path.
 * Reads eight 16-bit words starting at %1 + 2*index, arithmetic-shifts each
 * right by 7 and packs them with unsigned saturation into the 8 bytes of mm7.
 * mm1 is used as scratch.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
/* Forwarding layer so `index` is macro-expanded before '#' is applied. */
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
639

    
640
/*
 * Store 8 pixels as 32-bit words and close the "1:" loop.
 *
 * b, g, r, a : registers holding 8 packed bytes of each component
 * q0, q2, q3, t : scratch registers (overwritten)
 * dst, index : destination base and pixel index; 32 bytes are written to
 *              dst + index*4 via MOVNTQ
 * dstw       : operand compared against index
 *
 * The interleave puts B in the lowest byte of each pixel, then G, R, A.
 * b and r are overwritten as well. After the stores, index is advanced by 8
 * and control jumps back to label 1 while index is below dstw (unsigned
 * compare, "jb").
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
/* Forwarding layer so arguments are macro-expanded before '#' is applied. */
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
663

    
664
/*
 * Store 8 pixels as 16-bit 5-6-5 words and close the "1:" loop.
 *
 * Expects packed bytes in mm2 = B, mm4 = G, mm5 = R and mm7 = 0 (mm7 supplies
 * the zero high byte when G is widened to words).
 * B and R are masked with bF8, G with bFC; the result per pixel is
 * R in the top bits, G in the middle and B in the low bits.
 * 16 bytes are written to dst + index*2 via MOVNTQ; mm1 and mm3 are scratch.
 * Afterwards index is advanced by 8 and control jumps back to label 1 while
 * index is below dstw (unsigned compare).
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
/* Forwarding layer so arguments are macro-expanded before '#' is applied. */
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
691

    
692
/*
 * Store 8 pixels as 16-bit 5-5-5 words and close the "1:" loop.
 *
 * Expects packed bytes in mm2 = B, mm4 = G, mm5 = R and mm7 = 0 (mm7 supplies
 * the zero high byte when G is widened to words).
 * All three components are masked with bF8; compared with the 5-6-5 writer,
 * R is additionally shifted right by one and G is shifted left by 2 instead
 * of 3, leaving the top bit of every output word clear.
 * 16 bytes are written to dst + index*2 via MOVNTQ; mm1 and mm3 are scratch.
 * Afterwards index is advanced by 8 and control jumps back to label 1 while
 * index is below dstw (unsigned compare).
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
/* Forwarding layer so arguments are macro-expanded before '#' is applied. */
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
720

    
721
/*
 * Older 24-bit writer: packs 8 pixels into 24 bytes using shifts and the
 * bm00000111 / bm11111000 / bm00001111 byte masks, then closes the "1:" loop.
 *
 * Expects packed bytes in mm2 = B, mm4 = G, mm5 = R and mm7 = 0.
 * All of mm0..mm6 are overwritten.
 * The three quadwords are stored at dst, dst+8 and dst+16 via MOVNTQ; `dst`
 * is then advanced by 24 (so it must name a writable register) and `index`
 * by 8. Control jumps back to label 1 while index is below dstw (unsigned
 * compare).
 */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
776

    
777
/*
 * 24-bit writer using only shifts and unpacks: packs 8 pixels into 24 bytes
 * and closes the "1:" loop.
 *
 * Expects packed bytes in mm2 = B, mm4 = G, mm5 = R and mm7 = 0.
 * All of mm0..mm7 are overwritten (mm7 is reused as scratch after the first
 * unpack stage).
 * The three quadwords are stored at dst, dst+8 and dst+16 via MOVNTQ; `dst`
 * is then advanced by 24 (so it must name a writable register) and `index`
 * by 8. Control jumps back to label 1 while index is below dstw (unsigned
 * compare).
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
829

    
830
/*
 * 24-bit writer built on pshufw and the ff_M24A / ff_M24B / ff_M24C byte
 * masks: packs 8 pixels into 24 bytes and closes the "1:" loop.
 *
 * Expects packed bytes in mm2 = B, mm4 = G, mm5 = R. mm7 does not need to be
 * zero here: it is overwritten with ff_M24C on entry, and mm0 with ff_M24A.
 * mm1, mm3 and mm6 are scratch; mm4 is shifted right by one byte part-way
 * through, so G is not preserved.
 * The three quadwords are stored at dst, dst+8 and dst+16 via MOVNTQ; `dst`
 * is then advanced by 24 (so it must name a writable register) and `index`
 * by 8. Control jumps back to label 1 while index is below dstw (unsigned
 * compare).
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* on entry: mm2=B, %%mm4=G, %%mm5=R (mm0 and mm7 are loaded with masks below) */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
877

    
878
/* Select the 24-bit writer for this template instantiation: the pshufw-based
 * variant when HAVE_MMX2 is set, the plain MMX variant otherwise. Any earlier
 * definition is dropped first. */
#undef WRITEBGR24
#if HAVE_MMX2
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
885

    
886
/*
 * Store 8 pixels as packed 4:2:2 (16 bytes) and close the "1:" loop.
 *
 * Inputs are 16-bit words: mm1 and mm7 each hold four samples that end up in
 * the even output bytes, mm3 and mm4 hold four samples each that are
 * interleaved (mm3 first, then mm4) into the odd output bytes -- i.e. luma in
 * mm1/mm7 and U/V in mm3/mm4 for the usual Y U Y V layout.
 * Every input is packed to bytes with unsigned saturation first.
 * 16 bytes are written to dst + index*2 via MOVNTQ. Afterwards index is
 * advanced by 8 and control jumps back to label 1 while index is below dstw
 * (unsigned compare).
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
/* Forwarding layer so arguments are macro-expanded before '#' is applied. */
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
902

    
903

    
904
/**
 * Vertically filter one output line of planar data: luma always, chroma when
 * uDest is non-NULL, alpha when CONFIG_SWSCALE_ALPHA is enabled and aDest is
 * non-NULL.
 *
 * With HAVE_MMX and SWS_BITEXACT clear, the YSCALEYUV2YV12X asm macros do the
 * work (the _ACCURATE variants when SWS_ACCURATE_RND is set); vDest is written
 * whenever uDest is. Otherwise the AltiVec or plain C implementation is used.
 *
 * NOTE(review): the AltiVec call receives neither alpSrc nor aDest, so that
 * path does not appear to write the alpha plane -- confirm against
 * yuv2yuvX_altivec_real().
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if HAVE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
}
944

    
945
/**
 * Vertically filter one output line for the semi-planar formats.
 * There is no SIMD variant here: every argument except the context `c` is
 * forwarded unchanged to yuv2nv12XinC().
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
953

    
954
/**
 * Unfiltered (1:1) vertical output of one planar line: each 16-bit
 * intermediate sample is reduced to 8 bits.
 *
 * With HAVE_MMX and SWS_BITEXACT clear, each non-NULL destination plane
 * (alpha, luma, U, V) is processed by the YSCALEYUV2YV121 asm macro, or its
 * _ACCURATE variant when SWS_ACCURATE_RND is set. The asm is handed
 * end-of-line pointers plus a negative count.
 *
 * The C fallback computes (sample + 64) >> 7 and clamps to 0..255. Chroma is
 * written only when uDest is non-NULL (V samples live VOFW entries after the
 * U samples in chrSrc); alpha only when CONFIG_SWSCALE_ALPHA is enabled and
 * aDest is non-NULL.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if HAVE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= 4;
        /* The element offsets are applied on the int16_t pointers first; the
         * results are then viewed as byte pointers for the asm operands.
         * Explicit casts keep the initialisers type-correct and const-correct. */
        const uint8_t *src[4]= {
            (const uint8_t *)(alpSrc + dstW),
            (const uint8_t *)(lumSrc + dstW),
            (const uint8_t *)(chrSrc + chrDstW),
            (const uint8_t *)(chrSrc + VOFW + chrDstW)
        };
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND){
            while(p--){
                if (dst[p]){
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }else{
            while(p--){
                if (dst[p]){
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        /* bit 8 is set for every value outside 0..255 that an int16_t input can produce */
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest) {
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
    }

    if (CONFIG_SWSCALE_ALPHA && aDest) {
        for (i=0; i<dstW; i++){
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
    }
}
1026

    
1027

    
1028
/**
1029
 * vertical scale YV12 to RGB
1030
 */
1031
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1032
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1033
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1034
{
1035
#if HAVE_MMX
1036
    x86_reg dummy=0;
1037
    if(!(c->flags & SWS_BITEXACT)){
1038
        if (c->flags & SWS_ACCURATE_RND){
1039
            switch(c->dstFormat){
1040
            case PIX_FMT_RGB32:
1041
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1042
                    YSCALEYUV2PACKEDX_ACCURATE
1043
                    YSCALEYUV2RGBX
1044
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1045
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1046
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1047
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1048
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1049
                    "psraw                        $3, %%mm1         \n\t"
1050
                    "psraw                        $3, %%mm7         \n\t"
1051
                    "packuswb                  %%mm7, %%mm1         \n\t"
1052
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1053

    
1054
                    YSCALEYUV2PACKEDX_END
1055
                }else{
1056
                    YSCALEYUV2PACKEDX_ACCURATE
1057
                    YSCALEYUV2RGBX
1058
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1059
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1060

    
1061
                    YSCALEYUV2PACKEDX_END
1062
                }
1063
                return;
1064
            case PIX_FMT_BGR24:
1065
                YSCALEYUV2PACKEDX_ACCURATE
1066
                YSCALEYUV2RGBX
1067
                "pxor %%mm7, %%mm7 \n\t"
1068
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1069
                "add %4, %%"REG_c"                        \n\t"
1070
                WRITEBGR24(%%REGc, %5, %%REGa)
1071

    
1072

    
1073
                :: "r" (&c->redDither),
1074
                "m" (dummy), "m" (dummy), "m" (dummy),
1075
                "r" (dest), "m" (dstW)
1076
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1077
                );
1078
                return;
1079
            case PIX_FMT_RGB555:
1080
                YSCALEYUV2PACKEDX_ACCURATE
1081
                YSCALEYUV2RGBX
1082
                "pxor %%mm7, %%mm7 \n\t"
1083
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1084
#ifdef DITHER1XBPP
1085
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1086
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1087
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1088
#endif
1089

    
1090
                WRITERGB15(%4, %5, %%REGa)
1091
                YSCALEYUV2PACKEDX_END
1092
                return;
1093
            case PIX_FMT_RGB565:
1094
                YSCALEYUV2PACKEDX_ACCURATE
1095
                YSCALEYUV2RGBX
1096
                "pxor %%mm7, %%mm7 \n\t"
1097
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1098
#ifdef DITHER1XBPP
1099
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1100
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1101
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1102
#endif
1103

    
1104
                WRITERGB16(%4, %5, %%REGa)
1105
                YSCALEYUV2PACKEDX_END
1106
                return;
1107
            case PIX_FMT_YUYV422:
1108
                YSCALEYUV2PACKEDX_ACCURATE
1109
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1110

    
1111
                "psraw $3, %%mm3    \n\t"
1112
                "psraw $3, %%mm4    \n\t"
1113
                "psraw $3, %%mm1    \n\t"
1114
                "psraw $3, %%mm7    \n\t"
1115
                WRITEYUY2(%4, %5, %%REGa)
1116
                YSCALEYUV2PACKEDX_END
1117
                return;
1118
            }
1119
        }else{
1120
            switch(c->dstFormat)
1121
            {
1122
            case PIX_FMT_RGB32:
1123
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1124
                    YSCALEYUV2PACKEDX
1125
                    YSCALEYUV2RGBX
1126
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1127
                    "psraw                        $3, %%mm1         \n\t"
1128
                    "psraw                        $3, %%mm7         \n\t"
1129
                    "packuswb                  %%mm7, %%mm1         \n\t"
1130
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1131
                    YSCALEYUV2PACKEDX_END
1132
                }else{
1133
                    YSCALEYUV2PACKEDX
1134
                    YSCALEYUV2RGBX
1135
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1136
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1137
                    YSCALEYUV2PACKEDX_END
1138
                }
1139
                return;
1140
            case PIX_FMT_BGR24:
1141
                YSCALEYUV2PACKEDX
1142
                YSCALEYUV2RGBX
1143
                "pxor                    %%mm7, %%mm7       \n\t"
1144
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1145
                "add                        %4, %%"REG_c"   \n\t"
1146
                WRITEBGR24(%%REGc, %5, %%REGa)
1147

    
1148
                :: "r" (&c->redDither),
1149
                "m" (dummy), "m" (dummy), "m" (dummy),
1150
                "r" (dest),  "m" (dstW)
1151
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1152
                );
1153
                return;
1154
            case PIX_FMT_RGB555:
1155
                YSCALEYUV2PACKEDX
1156
                YSCALEYUV2RGBX
1157
                "pxor %%mm7, %%mm7 \n\t"
1158
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1159
#ifdef DITHER1XBPP
1160
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1161
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1162
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1163
#endif
1164

    
1165
                WRITERGB15(%4, %5, %%REGa)
1166
                YSCALEYUV2PACKEDX_END
1167
                return;
1168
            case PIX_FMT_RGB565:
1169
                YSCALEYUV2PACKEDX
1170
                YSCALEYUV2RGBX
1171
                "pxor %%mm7, %%mm7 \n\t"
1172
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1173
#ifdef DITHER1XBPP
1174
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1175
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1176
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1177
#endif
1178

    
1179
                WRITERGB16(%4, %5, %%REGa)
1180
                YSCALEYUV2PACKEDX_END
1181
                return;
1182
            case PIX_FMT_YUYV422:
1183
                YSCALEYUV2PACKEDX
1184
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1185

    
1186
                "psraw $3, %%mm3    \n\t"
1187
                "psraw $3, %%mm4    \n\t"
1188
                "psraw $3, %%mm1    \n\t"
1189
                "psraw $3, %%mm7    \n\t"
1190
                WRITEYUY2(%4, %5, %%REGa)
1191
                YSCALEYUV2PACKEDX_END
1192
                return;
1193
            }
1194
        }
1195
    }
1196
#endif /* HAVE_MMX */
1197
#if HAVE_ALTIVEC
1198
    /* The following list of supported dstFormat values should
1199
       match what's found in the body of ff_yuv2packedX_altivec() */
1200
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1201
       (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1202
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1203
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1204
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1205
                                   chrFilter, chrSrc, chrFilterSize,
1206
                                   dest, dstW, dstY);
1207
    else
1208
#endif
1209
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1210
                       chrFilter, chrSrc, chrFilterSize,
1211
                       alpSrc, dest, dstW, dstY);
1212
}
1213

    
1214
/**
1215
 * vertical bilinear scale YV12 to RGB
1216
 */
1217
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1218
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1219
{
1220
    int  yalpha1=4095- yalpha;
1221
    int uvalpha1=4095-uvalpha;
1222
    int i;
1223

    
1224
#if HAVE_MMX
1225
    if(!(c->flags & SWS_BITEXACT)){
1226
        switch(c->dstFormat)
1227
        {
1228
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1229
            case PIX_FMT_RGB32:
1230
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1231
#if ARCH_X86_64
1232
                    __asm__ volatile(
1233
                    YSCALEYUV2RGB(%%REGBP, %5)
1234
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1235
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1236
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1237
                    "packuswb            %%mm7, %%mm1       \n\t"
1238
                    WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1239

    
1240
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1241
                    "a" (&c->redDither)
1242
                    ,"r" (abuf0), "r" (abuf1)
1243
                    : "%"REG_BP
1244
                    );
1245
#else
1246
                    *(uint16_t **)(&c->u_temp)=abuf0;
1247
                    *(uint16_t **)(&c->v_temp)=abuf1;
1248
                    __asm__ volatile(
1249
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1250
                    "mov        %4, %%"REG_b"               \n\t"
1251
                    "push %%"REG_BP"                        \n\t"
1252
                    YSCALEYUV2RGB(%%REGBP, %5)
1253
                    "push                   %0              \n\t"
1254
                    "push                   %1              \n\t"
1255
                    "mov          "U_TEMP"(%5), %0          \n\t"
1256
                    "mov          "V_TEMP"(%5), %1          \n\t"
1257
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1258
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1259
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1260
                    "packuswb            %%mm7, %%mm1       \n\t"
1261
                    "pop                    %1              \n\t"
1262
                    "pop                    %0              \n\t"
1263
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1264
                    "pop %%"REG_BP"                         \n\t"
1265
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1266

    
1267
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1268
                    "a" (&c->redDither)
1269
                    );
1270
#endif
1271
                }else{
1272
                    __asm__ volatile(
1273
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1274
                    "mov        %4, %%"REG_b"               \n\t"
1275
                    "push %%"REG_BP"                        \n\t"
1276
                    YSCALEYUV2RGB(%%REGBP, %5)
1277
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1278
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1279
                    "pop %%"REG_BP"                         \n\t"
1280
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1281

    
1282
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1283
                    "a" (&c->redDither)
1284
                    );
1285
                }
1286
                return;
1287
            case PIX_FMT_BGR24:
1288
                __asm__ volatile(
1289
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1290
                "mov        %4, %%"REG_b"               \n\t"
1291
                "push %%"REG_BP"                        \n\t"
1292
                YSCALEYUV2RGB(%%REGBP, %5)
1293
                "pxor    %%mm7, %%mm7                   \n\t"
1294
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1295
                "pop %%"REG_BP"                         \n\t"
1296
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1297
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1298
                "a" (&c->redDither)
1299
                );
1300
                return;
1301
            case PIX_FMT_RGB555:
1302
                __asm__ volatile(
1303
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1304
                "mov        %4, %%"REG_b"               \n\t"
1305
                "push %%"REG_BP"                        \n\t"
1306
                YSCALEYUV2RGB(%%REGBP, %5)
1307
                "pxor    %%mm7, %%mm7                   \n\t"
1308
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1309
#ifdef DITHER1XBPP
1310
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1311
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1312
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1313
#endif
1314

    
1315
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1316
                "pop %%"REG_BP"                         \n\t"
1317
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1318

    
1319
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1320
                "a" (&c->redDither)
1321
                );
1322
                return;
1323
            case PIX_FMT_RGB565:
1324
                __asm__ volatile(
1325
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1326
                "mov        %4, %%"REG_b"               \n\t"
1327
                "push %%"REG_BP"                        \n\t"
1328
                YSCALEYUV2RGB(%%REGBP, %5)
1329
                "pxor    %%mm7, %%mm7                   \n\t"
1330
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1331
#ifdef DITHER1XBPP
1332
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1333
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1334
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1335
#endif
1336

    
1337
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1338
                "pop %%"REG_BP"                         \n\t"
1339
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1340
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1341
                "a" (&c->redDither)
1342
                );
1343
                return;
1344
            case PIX_FMT_YUYV422:
1345
                __asm__ volatile(
1346
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1347
                "mov %4, %%"REG_b"                        \n\t"
1348
                "push %%"REG_BP"                        \n\t"
1349
                YSCALEYUV2PACKED(%%REGBP, %5)
1350
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1351
                "pop %%"REG_BP"                         \n\t"
1352
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1353
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1354
                "a" (&c->redDither)
1355
                );
1356
                return;
1357
            default: break;
1358
        }
1359
    }
1360
#endif //HAVE_MMX
1361
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1362
}
1363

    
1364
/**
1365
 * YV12 to RGB without scaling or interpolating
1366
 */
1367
/*
 * Write one packed output line to dest from a single luma line (buf0),
 * the two chroma lines uvbuf0/uvbuf1 and, optionally, one alpha line (abuf0).
 *
 * Dispatch order, as implemented below:
 *  - SWS_FULL_CHR_H_INT set in flags: hand the work to c->yuv2packed2().
 *  - HAVE_MMX and SWS_BITEXACT not set: inline-asm paths for PIX_FMT_RGB32,
 *    PIX_FMT_BGR24, PIX_FMT_RGB555, PIX_FMT_RGB565 and PIX_FMT_YUYV422;
 *    each of these returns directly.
 *  - everything else: the generic C macros at the end of the function.
 *
 * uvalpha < 2048 selects the "1" macro variants, otherwise the "1b" variants
 * are used (both in the asm and in the C fallback).
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1368
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1369
{
1370
    /* yalpha1, i, buf1 and yalpha are not referenced directly in this body;
     * presumably they are consumed by the YSCALE_YUV_2_* macros expanded at
     * the end of the function -- confirm against the macro definitions. */
    const int yalpha1=0;
1371
    int i;
1372

    
1373
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1374
    const int yalpha= 4096; //FIXME ...
1375

    
1376
    /* Full chroma interpolation requested: delegate to c->yuv2packed2(),
     * passing buf0 and abuf0 for both of its line arguments and 0 for the
     * argument preceding uvalpha. */
    if (flags&SWS_FULL_CHR_H_INT)
1377
    {
1378
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1379
        return;
1380
    }
1381

    
1382
#if HAVE_MMX
1383
    /* The asm paths are skipped entirely when SWS_BITEXACT is set.
     *
     * Every asm block below follows the same frame: REG_b is saved to
     * ESP_OFFSET(%5) and loaded with dest (%4), REG_BP is pushed, the
     * conversion and store macros run, then REG_BP is popped and REG_b is
     * restored. Operand %5 is &c->redDither, which the blocks use as the
     * base address for their context-relative memory operands. */
    if(!(flags & SWS_BITEXACT)){
1384
        if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1385
        {
1386
            switch(dstFormat)
1387
            {
1388
            case PIX_FMT_RGB32:
1389
                /* With an alpha plane the "d" operand carries abuf0 instead
                 * of buf1 and YSCALEYUV2RGB1_ALPHA is expanded; without one,
                 * mm7 is set to all ones (pcmpeqd) before the store. */
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1390
                    __asm__ volatile(
1391
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1392
                    "mov        %4, %%"REG_b"               \n\t"
1393
                    "push %%"REG_BP"                        \n\t"
1394
                    YSCALEYUV2RGB1(%%REGBP, %5)
1395
                    YSCALEYUV2RGB1_ALPHA(%%REGBP)
1396
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1397
                    "pop %%"REG_BP"                         \n\t"
1398
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1399

    
1400
                    :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1401
                    "a" (&c->redDither)
1402
                    );
1403
                }else{
1404
                    __asm__ volatile(
1405
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1406
                    "mov        %4, %%"REG_b"               \n\t"
1407
                    "push %%"REG_BP"                        \n\t"
1408
                    YSCALEYUV2RGB1(%%REGBP, %5)
1409
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1410
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1411
                    "pop %%"REG_BP"                         \n\t"
1412
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1413

    
1414
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1415
                    "a" (&c->redDither)
1416
                    );
1417
                }
1418
                return;
1419
            case PIX_FMT_BGR24:
1420
                __asm__ volatile(
1421
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1422
                "mov        %4, %%"REG_b"               \n\t"
1423
                "push %%"REG_BP"                        \n\t"
1424
                YSCALEYUV2RGB1(%%REGBP, %5)
1425
                "pxor    %%mm7, %%mm7                   \n\t"
1426
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1427
                "pop %%"REG_BP"                         \n\t"
1428
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1429

    
1430
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1431
                "a" (&c->redDither)
1432
                );
1433
                return;
1434
            case PIX_FMT_RGB555:
1435
                __asm__ volatile(
1436
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1437
                "mov        %4, %%"REG_b"               \n\t"
1438
                "push %%"REG_BP"                        \n\t"
1439
                YSCALEYUV2RGB1(%%REGBP, %5)
1440
                "pxor    %%mm7, %%mm7                   \n\t"
1441
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1442
#ifdef DITHER1XBPP
1443
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1444
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1445
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1446
#endif
1447
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1448
                "pop %%"REG_BP"                         \n\t"
1449
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1450

    
1451
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1452
                "a" (&c->redDither)
1453
                );
1454
                return;
1455
            case PIX_FMT_RGB565:
1456
                __asm__ volatile(
1457
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1458
                "mov        %4, %%"REG_b"               \n\t"
1459
                "push %%"REG_BP"                        \n\t"
1460
                YSCALEYUV2RGB1(%%REGBP, %5)
1461
                "pxor    %%mm7, %%mm7                   \n\t"
1462
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1463
#ifdef DITHER1XBPP
1464
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1465
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1466
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1467
#endif
1468

    
1469
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1470
                "pop %%"REG_BP"                         \n\t"
1471
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1472

    
1473
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1474
                "a" (&c->redDither)
1475
                );
1476
                return;
1477
            case PIX_FMT_YUYV422:
1478
                __asm__ volatile(
1479
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1480
                "mov        %4, %%"REG_b"               \n\t"
1481
                "push %%"REG_BP"                        \n\t"
1482
                YSCALEYUV2PACKED1(%%REGBP, %5)
1483
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1484
                "pop %%"REG_BP"                         \n\t"
1485
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1486

    
1487
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488
                "a" (&c->redDither)
1489
                );
1490
                return;
1491
            }
1492
        }
1493
        else
1494
        {
1495
            /* uvalpha >= 2048: same per-format structure as above, but built
             * on the "1b" conversion macros. */
            switch(dstFormat)
1496
            {
1497
            case PIX_FMT_RGB32:
1498
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1499
                    __asm__ volatile(
1500
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1501
                    "mov        %4, %%"REG_b"               \n\t"
1502
                    "push %%"REG_BP"                        \n\t"
1503
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1504
                    YSCALEYUV2RGB1_ALPHA(%%REGBP)
1505
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1506
                    "pop %%"REG_BP"                         \n\t"
1507
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1508

    
1509
                    :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1510
                    "a" (&c->redDither)
1511
                    );
1512
                }else{
1513
                    __asm__ volatile(
1514
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1515
                    "mov        %4, %%"REG_b"               \n\t"
1516
                    "push %%"REG_BP"                        \n\t"
1517
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1518
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1519
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1520
                    "pop %%"REG_BP"                         \n\t"
1521
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1522

    
1523
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1524
                    "a" (&c->redDither)
1525
                    );
1526
                }
1527
                return;
1528
            case PIX_FMT_BGR24:
1529
                __asm__ volatile(
1530
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1531
                "mov        %4, %%"REG_b"               \n\t"
1532
                "push %%"REG_BP"                        \n\t"
1533
                YSCALEYUV2RGB1b(%%REGBP, %5)
1534
                "pxor    %%mm7, %%mm7                   \n\t"
1535
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1536
                "pop %%"REG_BP"                         \n\t"
1537
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1538

    
1539
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1540
                "a" (&c->redDither)
1541
                );
1542
                return;
1543
            case PIX_FMT_RGB555:
1544
                __asm__ volatile(
1545
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1546
                "mov        %4, %%"REG_b"               \n\t"
1547
                "push %%"REG_BP"                        \n\t"
1548
                YSCALEYUV2RGB1b(%%REGBP, %5)
1549
                "pxor    %%mm7, %%mm7                   \n\t"
1550
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1551
#ifdef DITHER1XBPP
1552
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1553
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1554
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1555
#endif
1556
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1557
                "pop %%"REG_BP"                         \n\t"
1558
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1559

    
1560
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1561
                "a" (&c->redDither)
1562
                );
1563
                return;
1564
            case PIX_FMT_RGB565:
1565
                __asm__ volatile(
1566
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1567
                "mov        %4, %%"REG_b"               \n\t"
1568
                "push %%"REG_BP"                        \n\t"
1569
                YSCALEYUV2RGB1b(%%REGBP, %5)
1570
                "pxor    %%mm7, %%mm7                   \n\t"
1571
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1572
#ifdef DITHER1XBPP
1573
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1574
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1575
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1576
#endif
1577

    
1578
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1579
                "pop %%"REG_BP"                         \n\t"
1580
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1581

    
1582
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1583
                "a" (&c->redDither)
1584
                );
1585
                return;
1586
            case PIX_FMT_YUYV422:
1587
                __asm__ volatile(
1588
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1589
                "mov        %4, %%"REG_b"               \n\t"
1590
                "push %%"REG_BP"                        \n\t"
1591
                YSCALEYUV2PACKED1b(%%REGBP, %5)
1592
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1593
                "pop %%"REG_BP"                         \n\t"
1594
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1595

    
1596
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1597
                "a" (&c->redDither)
1598
                );
1599
                return;
1600
            }
1601
        }
1602
    }
1603
#endif /* HAVE_MMX */
1604
    /* Generic C fallback: reached when no asm case above returned (no MMX,
     * SWS_BITEXACT set, or a dstFormat without an asm case). */
    if (uvalpha < 2048)
1605
    {
1606
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1607
    }else{
1608
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1609
    }
1610
}
1611

    
1612
//FIXME: the yuy2* functions can read up to 7 samples too many
1613

    
1614
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1615
{
1616
#if HAVE_MMX
1617
    __asm__ volatile(
1618
    "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1619
    "mov                    %0, %%"REG_a"       \n\t"
1620
    "1:                                         \n\t"
1621
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1622
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1623
    "pand                %%mm2, %%mm0           \n\t"
1624
    "pand                %%mm2, %%mm1           \n\t"
1625
    "packuswb            %%mm1, %%mm0           \n\t"
1626
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
1627
    "add                    $8, %%"REG_a"       \n\t"
1628
    " js                    1b                  \n\t"
1629
    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1630
    : "%"REG_a
1631
    );
1632
#else
1633
    int i;
1634
    for (i=0; i<width; i++)
1635
        dst[i]= src[2*i];
1636
#endif
1637
}
1638

    
1639
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1640
{
1641
#if HAVE_MMX
1642
    __asm__ volatile(
1643
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1644
    "mov                    %0, %%"REG_a"       \n\t"
1645
    "1:                                         \n\t"
1646
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1647
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1648
    "psrlw                  $8, %%mm0           \n\t"
1649
    "psrlw                  $8, %%mm1           \n\t"
1650
    "packuswb            %%mm1, %%mm0           \n\t"
1651
    "movq                %%mm0, %%mm1           \n\t"
1652
    "psrlw                  $8, %%mm0           \n\t"
1653
    "pand                %%mm4, %%mm1           \n\t"
1654
    "packuswb            %%mm0, %%mm0           \n\t"
1655
    "packuswb            %%mm1, %%mm1           \n\t"
1656
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
1657
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
1658
    "add                    $4, %%"REG_a"       \n\t"
1659
    " js                    1b                  \n\t"
1660
    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1661
    : "%"REG_a
1662
    );
1663
#else
1664
    int i;
1665
    for (i=0; i<width; i++)
1666
    {
1667
        dstU[i]= src1[4*i + 1];
1668
        dstV[i]= src1[4*i + 3];
1669
    }
1670
#endif
1671
    assert(src1 == src2);
1672
}
1673

    
1674
/* This is almost identical to the previous, and exists only because
1675
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1676
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1677
{
1678
#if HAVE_MMX
1679
    __asm__ volatile(
1680
    "mov                  %0, %%"REG_a"         \n\t"
1681
    "1:                                         \n\t"
1682
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1683
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1684
    "psrlw                $8, %%mm0             \n\t"
1685
    "psrlw                $8, %%mm1             \n\t"
1686
    "packuswb          %%mm1, %%mm0             \n\t"
1687
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1688
    "add                  $8, %%"REG_a"         \n\t"
1689
    " js                  1b                    \n\t"
1690
    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1691
    : "%"REG_a
1692
    );
1693
#else
1694
    int i;
1695
    for (i=0; i<width; i++)
1696
        dst[i]= src[2*i+1];
1697
#endif
1698
}
1699

    
1700
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1701
{
1702
#if HAVE_MMX
1703
    __asm__ volatile(
1704
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1705
    "mov                    %0, %%"REG_a"       \n\t"
1706
    "1:                                         \n\t"
1707
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1708
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1709
    "pand                %%mm4, %%mm0           \n\t"
1710
    "pand                %%mm4, %%mm1           \n\t"
1711
    "packuswb            %%mm1, %%mm0           \n\t"
1712
    "movq                %%mm0, %%mm1           \n\t"
1713
    "psrlw                  $8, %%mm0           \n\t"
1714
    "pand                %%mm4, %%mm1           \n\t"
1715
    "packuswb            %%mm0, %%mm0           \n\t"
1716
    "packuswb            %%mm1, %%mm1           \n\t"
1717
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
1718
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
1719
    "add                    $4, %%"REG_a"       \n\t"
1720
    " js                    1b                  \n\t"
1721
    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1722
    : "%"REG_a
1723
    );
1724
#else
1725
    int i;
1726
    for (i=0; i<width; i++)
1727
    {
1728
        dstU[i]= src1[4*i + 0];
1729
        dstV[i]= src1[4*i + 2];
1730
    }
1731
#endif
1732
    assert(src1 == src2);
1733
}
1734

    
1735
/**
 * Generate RENAME(name): a converter from packed RGB pixels to one output
 * byte per pixel.
 *
 * type              integer type that holds one packed pixel
 * shr, shg, shb     right shift applied to the pixel before masking
 * maskr/maskg/maskb mask applied after the shift to isolate a component
 * RY, GY, BY        weights multiplied with the isolated components
 * S                 final right shift; 33<<(S-1) is added before shifting
 */
#define BGR2Y(type, name, shr, shg, shb, maskr, maskg, maskb, RY, GY, BY, S)\
static inline void RENAME(name)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)\
{\
    const type *pixels= (const type*)src;\
    int x;\
    for (x=0; x<width; x++)\
    {\
        const int blue = (pixels[x]>>(shb))&(maskb);\
        const int green= (pixels[x]>>(shg))&(maskg);\
        const int red  = (pixels[x]>>(shr))&(maskr);\
\
        dst[x]= (((RY)*red + (GY)*green + (BY)*blue + (33<<((S)-1)))>>(S));\
    }\
}
1748

    
1749
/* Instantiate BGR2Y for two 32-bit layouts and four 15/16-bit layouts.
 * Argument order: pixel type, function name, shr, shg, shb,
 * maskr, maskg, maskb, RY, GY, BY, S.
 * Components are masked in place rather than shifted down to bit 0 (e.g.
 * green is masked with 0xFF00 or 0x07E0), and the weights are pre-shifted
 * by a per-component amount, presumably so the terms line up again before
 * the final shift S. */
BGR2Y(uint32_t, bgr32ToY,16, 0, 0, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1750
BGR2Y(uint32_t, rgb32ToY, 0, 0,16, 0x00FF, 0xFF00, 0x00FF, RY<< 8, GY   , BY<< 8, RGB2YUV_SHIFT+8)
1751
BGR2Y(uint16_t, bgr16ToY, 0, 0, 0, 0x001F, 0x07E0, 0xF800, RY<<11, GY<<5, BY    , RGB2YUV_SHIFT+8)
1752
BGR2Y(uint16_t, bgr15ToY, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, RY<<10, GY<<5, BY    , RGB2YUV_SHIFT+7)
1753
BGR2Y(uint16_t, rgb16ToY, 0, 0, 0, 0xF800, 0x07E0, 0x001F, RY    , GY<<5, BY<<11, RGB2YUV_SHIFT+8)
1754
BGR2Y(uint16_t, rgb15ToY, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, RY    , GY<<5, BY<<10, RGB2YUV_SHIFT+7)
1755

    
1756
static inline void RENAME(abgrToA)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused){
1757
    int i;
1758
    for (i=0; i<width; i++){
1759
        dst[i]= src[4*i];
1760
    }
1761
}
1762

    
1763
/**
 * Generate two chroma converters for a packed RGB pixel layout.
 *
 * RENAME(name) writes one dstU and one dstV byte per source pixel.
 * RENAME(name ## _half) writes one dstU and one dstV byte per pair of
 * adjacent source pixels: the components of both pixels are summed and the
 * final shift is one bit larger than in the full-resolution variant.
 *
 * type              integer type that holds one packed pixel
 * shr, shg, shb     right shift applied after masking a component
 * maska             alpha mask of the layout; not used by the generated code
 * maskr/maskg/maskb component masks
 * RU, GU, BU        weights for the dstU output
 * RV, GV, BV        weights for the dstV output
 * S                 final right shift of the full-resolution variant
 *
 * The generated functions ignore their `dummy` and `unused` parameters.
 */
#define BGR2UV(type, name, shr, shg, shb, maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S)\
static inline void RENAME(name)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        int b= (((const type*)src)[i]&(maskb))>>(shb);\
        int g= (((const type*)src)[i]&(maskg))>>(shg);\
        int r= (((const type*)src)[i]&(maskr))>>(shr);\
\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<((S)-1)))>>(S);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<((S)-1)))>>(S);\
    }\
}\
static inline void RENAME(name ## _half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, const uint8_t *dummy, long width, uint32_t *unused)\
{\
    int i;\
    for (i=0; i<width; i++)\
    {\
        /* The pixel sums are done in unsigned arithmetic: with 32-bit */\
        /* pixels a non-zero top byte would otherwise overflow int, which */\
        /* is undefined behaviour. The bits outside the r/g/b masks cancel */\
        /* out (pix0+pix1-gsum) or are masked away (gsum&=...). */\
        const unsigned pix0= ((const type*)src)[2*i+0];\
        const unsigned pix1= ((const type*)src)[2*i+1];\
        unsigned gsum= (pix0&~(unsigned)((maskr)|(maskb)))+(pix1&~(unsigned)((maskr)|(maskb)));\
        const int b= ((pix0+pix1-gsum)&(unsigned)((maskb)|(2*(maskb))))>>(shb);\
        const int r= ((pix0+pix1-gsum)&(unsigned)((maskr)|(2*(maskr))))>>(shr);\
        int g;\
\
        /* each summed component needs one bit more than its mask */\
        gsum&= (unsigned)((maskg)|(2*(maskg)));\
        g= gsum>>(shg);\
\
        /* NOTE(review): 257<<(S) may not fit in int for large S -- confirm */\
        /* against the value of RGB2YUV_SHIFT. */\
        dstU[i]= ((RU)*r + (GU)*g + (BU)*b + (257<<(S)))>>((S)+1);\
        dstV[i]= ((RV)*r + (GV)*g + (BV)*b + (257<<(S)))>>((S)+1);\
    }\
}
1795

    
1796
/* Instantiate BGR2UV (full-resolution and _half variants) for two 32-bit
 * layouts and four 15/16-bit layouts.
 * Argument order: pixel type, function name, shr, shg, shb,
 * maska, maskr, maskg, maskb, RU, GU, BU, RV, GV, BV, S.
 * Only the 32-bit layouts pass a non-zero maska (0xFF000000). */
BGR2UV(uint32_t, bgr32ToUV,16, 0, 0, 0xFF000000, 0xFF0000, 0xFF00,   0x00FF, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1797
BGR2UV(uint32_t, rgb32ToUV, 0, 0,16, 0xFF000000,   0x00FF, 0xFF00, 0xFF0000, RU<< 8, GU   , BU<< 8, RV<< 8, GV   , BV<< 8, RGB2YUV_SHIFT+8)
1798
BGR2UV(uint16_t, bgr16ToUV, 0, 0, 0,          0,   0x001F, 0x07E0,   0xF800, RU<<11, GU<<5, BU    , RV<<11, GV<<5, BV    , RGB2YUV_SHIFT+8)
1799
BGR2UV(uint16_t, bgr15ToUV, 0, 0, 0,          0,   0x001F, 0x03E0,   0x7C00, RU<<10, GU<<5, BU    , RV<<10, GV<<5, BV    , RGB2YUV_SHIFT+7)
1800
BGR2UV(uint16_t, rgb16ToUV, 0, 0, 0,          0,   0xF800, 0x07E0,   0x001F, RU    , GU<<5, BU<<11, RV    , GV<<5, BV<<11, RGB2YUV_SHIFT+8)
1801
BGR2UV(uint16_t, rgb15ToUV, 0, 0, 0,          0,   0x7C00, 0x03E0,   0x001F, RU    , GU<<5, BU<<10, RV    , GV<<5, BV<<10, RGB2YUV_SHIFT+7)
1802

    
1803
#if HAVE_MMX
1804
/*
 * MMX converter from 3-byte packed pixels to one output byte per pixel.
 *
 * dst       output buffer; 4 bytes are stored per loop iteration
 * src       packed input; 12 bytes are consumed per loop iteration
 * width     number of output samples; the loop counter starts at -width
 * srcFormat PIX_FMT_BGR24 selects the ff_bgr24toY* coefficients, any other
 *           value selects the ff_rgb24toY* coefficients
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat)
1805
{
1806

    
1807
    /* Load the two coefficient words into mm5/mm6. The main loop below is a
     * separate asm statement and relies on these registers still holding
     * the values loaded here. */
    if(srcFormat == PIX_FMT_BGR24){
1808
        __asm__ volatile(
1809
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
1810
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
1811
            :
1812
        );
1813
    }else{
1814
        __asm__ volatile(
1815
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
1816
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
1817
            :
1818
        );
1819
    }
1820

    
1821
    /* Main loop: mm4 holds ff_bgr24toYOffset, mm7 is zero for byte->word
     * unpacking. Each iteration does four 4-byte loads at offsets 0, 2, 6
     * and 8, multiplies with pmaddwd, adds the offset, shifts right by 15,
     * packs and stores 4 bytes at dst+width+counter. */
    __asm__ volatile(
1822
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
1823
        "mov                        %2, %%"REG_a"   \n\t"
1824
        "pxor                    %%mm7, %%mm7       \n\t"
1825
        "1:                                         \n\t"
1826
        PREFETCH"               64(%0)              \n\t"
1827
        "movd                     (%0), %%mm0       \n\t"
1828
        "movd                    2(%0), %%mm1       \n\t"
1829
        "movd                    6(%0), %%mm2       \n\t"
1830
        "movd                    8(%0), %%mm3       \n\t"
1831
        "add                       $12, %0          \n\t"
1832
        "punpcklbw               %%mm7, %%mm0       \n\t"
1833
        "punpcklbw               %%mm7, %%mm1       \n\t"
1834
        "punpcklbw               %%mm7, %%mm2       \n\t"
1835
        "punpcklbw               %%mm7, %%mm3       \n\t"
1836
        "pmaddwd                 %%mm5, %%mm0       \n\t"
1837
        "pmaddwd                 %%mm6, %%mm1       \n\t"
1838
        "pmaddwd                 %%mm5, %%mm2       \n\t"
1839
        "pmaddwd                 %%mm6, %%mm3       \n\t"
1840
        "paddd                   %%mm1, %%mm0       \n\t"
1841
        "paddd                   %%mm3, %%mm2       \n\t"
1842
        "paddd                   %%mm4, %%mm0       \n\t"
1843
        "paddd                   %%mm4, %%mm2       \n\t"
1844
        "psrad                     $15, %%mm0       \n\t"
1845
        "psrad                     $15, %%mm2       \n\t"
1846
        "packssdw                %%mm2, %%mm0       \n\t"
1847
        "packuswb                %%mm0, %%mm0       \n\t"
1848
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
1849
        "add                        $4, %%"REG_a"   \n\t"
1850
        " js                        1b              \n\t"
1851
    : "+r" (src)
1852
    : "r" (dst+width), "g" ((x86_reg)-width)
1853
    : "%"REG_a
1854
    );
1855
}
1856

    
1857
/*
 * MMX converter from 3-byte packed pixels to two output planes.
 *
 * dstU, dstV output planes; 4 bytes are stored to each per loop iteration
 * src        packed input; 12 bytes are consumed per loop iteration
 * width      number of samples per plane; the loop counter starts at -width
 * srcFormat  selects the coefficient row: ff_bgr24toUV[1] for
 *            PIX_FMT_RGB24, ff_bgr24toUV[0] otherwise
 *
 * Operand %4 is the first element of the selected row; the code addresses
 * it at byte offsets 0, 8, 16 and 24 (the last one is kept in mm6).
 */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, int srcFormat)
1858
{
1859
    __asm__ volatile(
1860
        "movq                    24+%4, %%mm6       \n\t"
1861
        "mov                        %3, %%"REG_a"   \n\t"
1862
        "pxor                    %%mm7, %%mm7       \n\t"
1863
        "1:                                         \n\t"
1864
        PREFETCH"               64(%0)              \n\t"
1865
        /* first half: loads at offsets 0 and 2, results in mm0 and mm2 */
        "movd                     (%0), %%mm0       \n\t"
1866
        "movd                    2(%0), %%mm1       \n\t"
1867
        "punpcklbw               %%mm7, %%mm0       \n\t"
1868
        "punpcklbw               %%mm7, %%mm1       \n\t"
1869
        "movq                    %%mm0, %%mm2       \n\t"
1870
        "movq                    %%mm1, %%mm3       \n\t"
1871
        "pmaddwd                    %4, %%mm0       \n\t"
1872
        "pmaddwd                  8+%4, %%mm1       \n\t"
1873
        "pmaddwd                 16+%4, %%mm2       \n\t"
1874
        "pmaddwd                 %%mm6, %%mm3       \n\t"
1875
        "paddd                   %%mm1, %%mm0       \n\t"
1876
        "paddd                   %%mm3, %%mm2       \n\t"
1877

    
1878
        /* second half: loads at offsets 6 and 8, results in mm1 and mm4 */
        "movd                    6(%0), %%mm1       \n\t"
1879
        "movd                    8(%0), %%mm3       \n\t"
1880
        "add                       $12, %0          \n\t"
1881
        "punpcklbw               %%mm7, %%mm1       \n\t"
1882
        "punpcklbw               %%mm7, %%mm3       \n\t"
1883
        "movq                    %%mm1, %%mm4       \n\t"
1884
        "movq                    %%mm3, %%mm5       \n\t"
1885
        "pmaddwd                    %4, %%mm1       \n\t"
1886
        "pmaddwd                  8+%4, %%mm3       \n\t"
1887
        "pmaddwd                 16+%4, %%mm4       \n\t"
1888
        "pmaddwd                 %%mm6, %%mm5       \n\t"
1889
        "paddd                   %%mm3, %%mm1       \n\t"
1890
        "paddd                   %%mm5, %%mm4       \n\t"
1891

    
1892
        /* add ff_bgr24toUVOffset, shift right by 15, pack and store:
         * mm0 goes to dstU (%1), mm2 goes to dstV (%2) */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
1893
        "paddd                   %%mm3, %%mm0       \n\t"
1894
        "paddd                   %%mm3, %%mm2       \n\t"
1895
        "paddd                   %%mm3, %%mm1       \n\t"
1896
        "paddd                   %%mm3, %%mm4       \n\t"
1897
        "psrad                     $15, %%mm0       \n\t"
1898
        "psrad                     $15, %%mm2       \n\t"
1899
        "psrad                     $15, %%mm1       \n\t"
1900
        "psrad                     $15, %%mm4       \n\t"
1901
        "packssdw                %%mm1, %%mm0       \n\t"
1902
        "packssdw                %%mm4, %%mm2       \n\t"
1903
        "packuswb                %%mm0, %%mm0       \n\t"
1904
        "packuswb                %%mm2, %%mm2       \n\t"
1905
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
1906
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
1907
        "add                        $4, %%"REG_a"   \n\t"
1908
        " js                        1b              \n\t"
1909
    : "+r" (src)
1910
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1911
    : "%"REG_a
1912
    );
1913
}
1914
#endif
1915

    
1916
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1917
{
1918
#if HAVE_MMX
1919
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1920
#else
1921
    int i;
1922
    for (i=0; i<width; i++)
1923
    {
1924
        int b= src[i*3+0];
1925
        int g= src[i*3+1];
1926
        int r= src[i*3+2];
1927

    
1928
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1929
    }
1930
#endif /* HAVE_MMX */
1931
}
1932

    
1933
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1934
{
1935
#if HAVE_MMX
1936
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1937
#else
1938
    int i;
1939
    for (i=0; i<width; i++)
1940
    {
1941
        int b= src1[3*i + 0];
1942
        int g= src1[3*i + 1];
1943
        int r= src1[3*i + 2];
1944

    
1945
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1946
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1947
    }
1948
#endif /* HAVE_MMX */
1949
    assert(src1 == src2);
1950
}
1951

    
1952
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1953
{
1954
    int i;
1955
    for (i=0; i<width; i++)
1956
    {
1957
        int b= src1[6*i + 0] + src1[6*i + 3];
1958
        int g= src1[6*i + 1] + src1[6*i + 4];
1959
        int r= src1[6*i + 2] + src1[6*i + 5];
1960

    
1961
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1962
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1963
    }
1964
    assert(src1 == src2);
1965
}
1966

    
1967
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1968
{
1969
#if HAVE_MMX
1970
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1971
#else
1972
    int i;
1973
    for (i=0; i<width; i++)
1974
    {
1975
        int r= src[i*3+0];
1976
        int g= src[i*3+1];
1977
        int b= src[i*3+2];
1978

    
1979
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1980
    }
1981
#endif
1982
}
1983

    
1984
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1985
{
1986
#if HAVE_MMX
1987
    assert(src1==src2);
1988
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1989
#else
1990
    int i;
1991
    assert(src1==src2);
1992
    for (i=0; i<width; i++)
1993
    {
1994
        int r= src1[3*i + 0];
1995
        int g= src1[3*i + 1];
1996
        int b= src1[3*i + 2];
1997

    
1998
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1999
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2000
    }
2001
#endif
2002
}
2003

    
2004
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2005
{
2006
    int i;
2007
    assert(src1==src2);
2008
    for (i=0; i<width; i++)
2009
    {
2010
        int r= src1[6*i + 0] + src1[6*i + 3];
2011
        int g= src1[6*i + 1] + src1[6*i + 4];
2012
        int b= src1[6*i + 2] + src1[6*i + 5];
2013

    
2014
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2015
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2016
    }
2017
}
2018

    
2019

    
2020
static inline void RENAME(palToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *pal)
2021
{
2022
    int i;
2023
    for (i=0; i<width; i++)
2024
    {
2025
        int d= src[i];
2026

    
2027
        dst[i]= pal[d] & 0xFF;
2028
    }
2029
}
2030

    
2031
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV,
2032
                                   const uint8_t *src1, const uint8_t *src2,
2033
                                   long width, uint32_t *pal)
2034
{
2035
    int i;
2036
    assert(src1 == src2);
2037
    for (i=0; i<width; i++)
2038
    {
2039
        int p= pal[src1[i]];
2040

    
2041
        dstU[i]= p>>8;
2042
        dstV[i]= p>>16;
2043
    }
2044
}
2045

    
2046
static inline void RENAME(monowhite2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
2047
{
2048
    int i, j;
2049
    for (i=0; i<width/8; i++){
2050
        int d= ~src[i];
2051
        for(j=0; j<8; j++)
2052
            dst[8*i+j]= ((d>>(7-j))&1)*255;
2053
    }
2054
}
2055

    
2056
static inline void RENAME(monoblack2Y)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
2057
{
2058
    int i, j;
2059
    for (i=0; i<width/8; i++){
2060
        int d= src[i];
2061
        for(j=0; j<8; j++)
2062
            dst[8*i+j]= ((d>>(7-j))&1)*255;
2063
    }
2064
}
2065

    
2066
// bilinear / bicubic scaling
2067
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2068
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
2069
{
2070
#if HAVE_MMX
2071
    assert(filterSize % 4 == 0 && filterSize>0);
2072
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2073
    {
2074
        x86_reg counter= -2*dstW;
2075
        filter-= counter*2;
2076
        filterPos-= counter/2;
2077
        dst-= counter/2;
2078
        __asm__ volatile(
2079
#if defined(PIC)
2080
        "push            %%"REG_b"              \n\t"
2081
#endif
2082
        "pxor                %%mm7, %%mm7       \n\t"
2083
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
2084
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
2085
        ASMALIGN(4)
2086
        "1:                                     \n\t"
2087
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
2088
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
2089
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
2090
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
2091
        "movd      (%3, %%"REG_a"), %%mm0       \n\t"
2092
        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
2093
        "punpcklbw           %%mm7, %%mm0       \n\t"
2094
        "punpcklbw           %%mm7, %%mm2       \n\t"
2095
        "pmaddwd             %%mm1, %%mm0       \n\t"
2096
        "pmaddwd             %%mm2, %%mm3       \n\t"
2097
        "movq                %%mm0, %%mm4       \n\t"
2098
        "punpckldq           %%mm3, %%mm0       \n\t"
2099
        "punpckhdq           %%mm3, %%mm4       \n\t"
2100
        "paddd               %%mm4, %%mm0       \n\t"
2101
        "psrad                  $7, %%mm0       \n\t"
2102
        "packssdw            %%mm0, %%mm0       \n\t"
2103
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
2104
        "add                    $4, %%"REG_BP"  \n\t"
2105
        " jnc                   1b              \n\t"
2106

    
2107
        "pop            %%"REG_BP"              \n\t"
2108
#if defined(PIC)
2109
        "pop             %%"REG_b"              \n\t"
2110
#endif
2111
        : "+a" (counter)
2112
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2113
#if !defined(PIC)
2114
        : "%"REG_b
2115
#endif
2116
        );
2117
    }
2118
    else if (filterSize==8)
2119
    {
2120
        x86_reg counter= -2*dstW;
2121
        filter-= counter*4;
2122
        filterPos-= counter/2;
2123
        dst-= counter/2;
2124
        __asm__ volatile(
2125
#if defined(PIC)
2126
        "push             %%"REG_b"             \n\t"
2127
#endif
2128
        "pxor                 %%mm7, %%mm7      \n\t"
2129
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
2130
        "mov              %%"REG_a", %%"REG_BP" \n\t"
2131
        ASMALIGN(4)
2132
        "1:                                     \n\t"
2133
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
2134
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
2135
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
2136
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2137
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
2138
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
2139
        "punpcklbw            %%mm7, %%mm0      \n\t"
2140
        "punpcklbw            %%mm7, %%mm2      \n\t"
2141
        "pmaddwd              %%mm1, %%mm0      \n\t"
2142
        "pmaddwd              %%mm2, %%mm3      \n\t"
2143

    
2144
        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
2145
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2146
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
2147
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
2148
        "punpcklbw            %%mm7, %%mm4      \n\t"
2149
        "punpcklbw            %%mm7, %%mm2      \n\t"
2150
        "pmaddwd              %%mm1, %%mm4      \n\t"
2151
        "pmaddwd              %%mm2, %%mm5      \n\t"
2152
        "paddd                %%mm4, %%mm0      \n\t"
2153
        "paddd                %%mm5, %%mm3      \n\t"
2154
        "movq                 %%mm0, %%mm4      \n\t"
2155
        "punpckldq            %%mm3, %%mm0      \n\t"
2156
        "punpckhdq            %%mm3, %%mm4      \n\t"
2157
        "paddd                %%mm4, %%mm0      \n\t"
2158
        "psrad                   $7, %%mm0      \n\t"
2159
        "packssdw             %%mm0, %%mm0      \n\t"
2160
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2161
        "add                     $4, %%"REG_BP" \n\t"
2162
        " jnc                    1b             \n\t"
2163

    
2164
        "pop             %%"REG_BP"             \n\t"
2165
#if defined(PIC)
2166
        "pop              %%"REG_b"             \n\t"
2167
#endif
2168
        : "+a" (counter)
2169
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2170
#if !defined(PIC)
2171
        : "%"REG_b
2172
#endif
2173
        );
2174
    }
2175
    else
2176
    {
2177
        uint8_t *offset = src+filterSize;
2178
        x86_reg counter= -2*dstW;
2179
        //filter-= counter*filterSize/2;
2180
        filterPos-= counter/2;
2181
        dst-= counter/2;
2182
        __asm__ volatile(
2183
        "pxor                  %%mm7, %%mm7     \n\t"
2184
        ASMALIGN(4)
2185
        "1:                                     \n\t"
2186
        "mov                      %2, %%"REG_c" \n\t"
2187
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2188
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2189
        "mov                      %5, %%"REG_c" \n\t"
2190
        "pxor                  %%mm4, %%mm4     \n\t"
2191
        "pxor                  %%mm5, %%mm5     \n\t"
2192
        "2:                                     \n\t"
2193
        "movq                   (%1), %%mm1     \n\t"
2194
        "movq               (%1, %6), %%mm3     \n\t"
2195
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2196
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2197
        "punpcklbw             %%mm7, %%mm0     \n\t"
2198
        "punpcklbw             %%mm7, %%mm2     \n\t"
2199
        "pmaddwd               %%mm1, %%mm0     \n\t"
2200
        "pmaddwd               %%mm2, %%mm3     \n\t"
2201
        "paddd                 %%mm3, %%mm5     \n\t"
2202
        "paddd                 %%mm0, %%mm4     \n\t"
2203
        "add                      $8, %1        \n\t"
2204
        "add                      $4, %%"REG_c" \n\t"
2205
        "cmp                      %4, %%"REG_c" \n\t"
2206
        " jb                      2b            \n\t"
2207
        "add                      %6, %1        \n\t"
2208
        "movq                  %%mm4, %%mm0     \n\t"
2209
        "punpckldq             %%mm5, %%mm4     \n\t"
2210
        "punpckhdq             %%mm5, %%mm0     \n\t"
2211
        "paddd                 %%mm0, %%mm4     \n\t"
2212
        "psrad                    $7, %%mm4     \n\t"
2213
        "packssdw              %%mm4, %%mm4     \n\t"
2214
        "mov                      %3, %%"REG_a" \n\t"
2215
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2216
        "add                      $4, %0        \n\t"
2217
        " jnc                     1b            \n\t"
2218

    
2219
        : "+r" (counter), "+r" (filter)
2220
        : "m" (filterPos), "m" (dst), "m"(offset),
2221
          "m" (src), "r" ((x86_reg)filterSize*2)
2222
        : "%"REG_a, "%"REG_c, "%"REG_d
2223
        );
2224
    }
2225
#else
2226
#if HAVE_ALTIVEC
2227
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2228
#else
2229
    int i;
2230
    for (i=0; i<dstW; i++)
2231
    {
2232
        int j;
2233
        int srcPos= filterPos[i];
2234
        int val=0;
2235
        //printf("filterPos: %d\n", filterPos[i]);
2236
        for (j=0; j<filterSize; j++)
2237
        {
2238
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2239
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2240
        }
2241
        //filter += hFilterSize;
2242
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2243
        //dst[i] = val>>7;
2244
    }
2245
#endif /* HAVE_ALTIVEC */
2246
#endif /* HAVE_MMX */
2247
}
2248

    
2249
/**
 * Fast bilinear horizontal scaling of one 8-bit line into 16-bit samples.
 *
 * The source position advances by xInc (16.16 fixed point) per output sample.
 * The two neighbouring source samples are blended with a 7-bit weight taken
 * from the top bits of the fractional part, giving results scaled by 128.
 * src[idx + 1] is read for every output sample.
 *
 * @param c        not referenced
 * @param dst      output samples
 * @param dstWidth number of output samples
 * @param src      input line
 * @param srcW     not referenced
 * @param xInc     source step per output sample, 16.16 fixed point
 */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        int dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
    unsigned int pos = 0;
    int x;

    for (x = 0; x < dstWidth; x++, pos += xInc) {
        const unsigned int idx  = pos >> 16;
        const unsigned int frac = (pos & 0xFFFF) >> 9;

        dst[x] = (src[idx]<<7) + (src[idx+1] - src[idx])*frac;
    }
}
2263

    
2264
      // *** horizontal scale Y line to temp buffer
2265
/**
 * Horizontally scale one luma (or alpha) line into the 16-bit temp buffer.
 *
 * Steps, in order:
 *  1. adjust src by a per-format byte offset (32-bit RGB variants only);
 *  2. if c->hyscale_internal is set, run it to convert the input line into
 *     formatConvBuffer and continue from there;
 *  3. scale, either through c->hScale or, when SWS_FAST_BILINEAR is requested,
 *     through the fast bilinear code (runtime-generated MMX2 code, plain x86
 *     asm, or c->hyscale_fast);
 *  4. for non-alpha lines, when source and destination ranges differ and the
 *     destination is neither RGB nor BGR, rescale the sample range in place.
 *
 * @param c                scaler context
 * @param dst              output line
 * @param dstWidth         number of output samples
 * @param src              input line
 * @param srcW             input width
 * @param xInc             source step per output sample, 16.16 fixed point
 * @param flags            scaler flags; only SWS_FAST_BILINEAR is tested here
 * @param hLumFilter       coefficients passed to c->hScale
 * @param hLumFilterPos    positions passed to c->hScale
 * @param hLumFilterSize   tap count passed to c->hScale
 * @param srcFormat        source pixel format
 * @param formatConvBuffer scratch line for c->hyscale_internal
 * @param pal              passed through to c->hyscale_internal
 * @param isAlpha          nonzero when the alpha plane is being scaled
 */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                   int flags, const int16_t *hLumFilter,
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
                                   int srcFormat, uint8_t *formatConvBuffer,
                                   uint32_t *pal, int isAlpha)
{
    void *funnyYCode = c->funnyYCode;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    int16_t *mmx2Filter = c->lumMmx2Filter;
    int32_t *mmx2FilterPos = c->lumMmx2FilterPos;
    int useFilter;

    /* per-format byte offset into the packed 32-bit pixel */
    if (isAlpha)
        src += (srcFormat == PIX_FMT_RGB32   || srcFormat == PIX_FMT_BGR32  ) ? 3 : 0;
    else
        src += (srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1) ? ALT32_CORR : 0;

    if (c->hyscale_internal) {
        c->hyscale_internal(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }

#if HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    useFilter = !(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed);
#else
    useFilter = !(flags&SWS_FAST_BILINEAR);
#endif

    if (useFilter)
    {
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86 && CONFIG_GPL
#if HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyYCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            /* overwrite the rightmost outputs whose source position is at or
               past the last source sample with that sample, scaled by 128 */
            i = dstWidth-1;
            while ((i*xInc)>>16 >= srcW-1) {
                dst[i] = src[srcW-1]*128;
                i--;
            }
        }
        else
        {
#endif /* HAVE_MMX2 */
        x86_reg xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
#endif /* ARCH_X86 && CONFIG_GPL */
    }

    if (!isAlpha && c->srcRange != c->dstRange && !isRGB(c->dstFormat) && !isBGR(c->dstFormat)) {
        int i = 0;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if (c->srcRange) {
            while (i < dstWidth) {
                dst[i] = (dst[i]*14071 + 33561947)>>14;
                i++;
            }
        } else {
            while (i < dstWidth) {
                /* the input is capped at 30189 before being scaled up */
                dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
                i++;
            }
        }
    }
}
2433

    
2434
/**
 * Fast bilinear horizontal scaling of one pair of 8-bit chroma lines.
 *
 * The source position advances by xInc (16.16 fixed point) per output sample.
 * The two neighbouring source samples are blended with the 7-bit weights
 * frac and frac ^ 127 (which add up to 127). The result for src1 is stored at
 * dst[x] and the result for src2 at dst[x + VOFW]. src1[idx + 1] and
 * src2[idx + 1] are read for every output sample.
 *
 * @param c        not referenced
 * @param dst      output buffer holding both planes, VOFW samples apart
 * @param dstWidth number of output samples per plane
 * @param src1     first input line
 * @param src2     second input line
 * @param srcW     not referenced
 * @param xInc     source step per output sample, 16.16 fixed point
 */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        int dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
    unsigned int pos = 0;
    int x;

    for (x = 0; x < dstWidth; x++, pos += xInc) {
        const unsigned int idx  = pos >> 16;
        const unsigned int frac = (pos & 0xFFFF) >> 9;
        const unsigned int inv  = frac ^ 127;

        dst[x]      = (src1[idx]*inv + src1[idx+1]*frac);
        dst[x+VOFW] = (src2[idx]*inv + src2[idx+1]*frac);
    }
}
2453

    
2454
/**
 * Horizontally scale one pair of chroma lines into the 16-bit temp buffer.
 *
 * The first plane is written to dst[0..] and the second to dst[VOFW..].
 * Nothing is done for gray and 1 bpp mono source formats.
 *
 * Steps, in order:
 *  1. adjust src1/src2 by ALT32_CORR for the RGB32_1 and BGR32_1 formats
 *     (the same two formats that hyscale offsets for luma);
 *  2. if c->hcscale_internal is set, run it to convert the input into
 *     formatConvBuffer (second plane at formatConvBuffer + VOFW) and continue
 *     from there;
 *  3. scale, either through c->hScale or, when SWS_FAST_BILINEAR is requested,
 *     through the fast bilinear code (runtime-generated MMX2 code, plain x86
 *     asm, or c->hcscale_fast);
 *  4. when source and destination ranges differ and the destination is
 *     neither RGB nor BGR, rescale the sample range of both planes in place.
 *
 * @param c                scaler context
 * @param dst              output buffer holding both planes, VOFW samples apart
 * @param dstWidth         number of output samples per plane
 * @param src1             first input chroma line
 * @param src2             second input chroma line
 * @param srcW             input width
 * @param xInc             source step per output sample, 16.16 fixed point
 * @param flags            scaler flags; only SWS_FAST_BILINEAR is tested here
 * @param hChrFilter       coefficients passed to c->hScale
 * @param hChrFilterPos    positions passed to c->hScale
 * @param hChrFilterSize   tap count passed to c->hScale
 * @param srcFormat        source pixel format
 * @param formatConvBuffer scratch buffer for c->hcscale_internal
 * @param pal              passed through to c->hcscale_internal
 */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, int flags, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   int srcFormat, uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{
    int32_t *mmx2FilterPos = c->chrMmx2FilterPos;
    int16_t *mmx2Filter = c->chrMmx2Filter;
    int canMMX2BeUsed = c->canMMX2BeUsed;
    void *funnyUVCode = c->funnyUVCode;

    if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
        return;

    /* both alternate 32-bit layouts need the offset, exactly as in hyscale;
       previously BGR32_1 was left out and its chroma was read misaligned */
    if (srcFormat==PIX_FMT_RGB32_1 || srcFormat==PIX_FMT_BGR32_1) {
        src1 += ALT32_CORR;
        src2 += ALT32_CORR;
    }

    if (c->hcscale_internal) {
        c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }

#if HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86 && CONFIG_GPL
#if HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

#if ARCH_X86_64

#define FUNNY_UV_CODE \
            "movl       (%%"REG_b"), %%esi      \n\t"\
            "call               *%4             \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
            "add          %%"REG_S", %%"REG_c"  \n\t"\
            "add          %%"REG_a", %%"REG_D"  \n\t"\
            "xor          %%"REG_a", %%"REG_a"  \n\t"\

#else

#define FUNNY_UV_CODE \
            "movl       (%%"REG_b"), %%esi      \n\t"\
            "call               *%4             \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add          %%"REG_a", %%"REG_D"  \n\t"\
            "xor          %%"REG_a", %%"REG_a"  \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE
FUNNY_UV_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyUVCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
            );
            /* overwrite the rightmost outputs whose source position is at or
               past the last source sample with that sample, scaled by 128 */
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
            {
                dst[i] = src1[srcW-1]*128;
                dst[i+VOFW] = src2[srcW-1]*128;
            }
        }
        else
        {
#endif /* HAVE_MMX2 */
            x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
            uint16_t xInc_mask = xInc & 0xffff;
            __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll      $16, %%edi                   \n\t"
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov        %1, %%"REG_D"               \n\t"
            "shrl       $9, %%esi                   \n\t"
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
            "shll      $16, %%edi                   \n\t"
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
            "mov        %1, %%"REG_D"               \n\t"
            "shrl       $9, %%esi                   \n\t"
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFFFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC 4.0. */
#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
            :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
            );
#if HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
#endif /* ARCH_X86 && CONFIG_GPL */
    }
    if(c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))){
        int i;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if(c->srcRange){
            for (i=0; i<dstWidth; i++){
                dst[i     ]= (dst[i     ]*1799 + 4081085)>>11; //1469
                dst[i+VOFW]= (dst[i+VOFW]*1799 + 4081085)>>11; //1469
            }
        }else{
            for (i=0; i<dstWidth; i++){
                /* the input is capped at 30775 before being scaled up */
                dst[i     ]= (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
                dst[i+VOFW]= (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
            }
        }
    }
}
2643

    
2644
/**
 * Scale one horizontal slice of the source picture.
 *
 * Source lines of the slice are horizontally scaled (hyscale/hcscale) into
 * the line ring buffers lumPixBuf / chrPixBuf / alpPixBuf. For every
 * destination line whose vertical filter inputs are all available, the
 * matching vertical scaler / output converter is called. When the slice does
 * not contain enough lines for the next destination line, the remaining
 * slice lines are buffered and the function returns, so that the next call
 * can continue where this one stopped.
 *
 * Side effects on the arguments: for packed source formats all four src[] /
 * srcStride[] entries are overwritten with plane 0; srcStride[1] and
 * srcStride[2] are shifted left by c->vChrDrop.
 *
 * @param c          scaler context; dstY, lumBufIndex, chrBufIndex,
 *                   lastInLumBuf and lastInChrBuf are read and written back
 * @param src        source plane pointers for this slice (modified, see above)
 * @param srcStride  source plane strides (modified, see above)
 * @param srcSliceY  index of the first source line contained in the slice
 * @param srcSliceH  number of source lines in the slice
 * @param dst        destination plane pointers
 * @param dstStride  destination plane strides
 * @return number of destination lines written by this call
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    /* slice position/height in chroma lines; the height is rounded up */
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* packed input: every "plane" is read from plane 0 */
    if (isPacked(c->srcFormat)){
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
    //       (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
    {
    static volatile int i=0;
    i++;
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
    i--;
    }
#endif

    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
    //dstStride[0],dstStride[1],dstStride[2]);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0)
    {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready)
        {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
        // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
                lumBufIndex++;
                //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                //printf("%d %d\n", lumBufIndex, vLumBufSize);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                                flags, hLumFilter, hLumFilterPos, hLumFilterSize,
                                c->srcFormat, formatConvBuffer,
                                pal, 0);
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                    RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                    flags, hLumFilter, hLumFilterPos, hLumFilterSize,
                                    c->srcFormat, formatConvBuffer,
                                    pal, 1);
                lastInLumBuf++;
            }
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
                //FIXME replace parameters through context struct (some at least)

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    c->srcFormat, formatConvBuffer,
                                    pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest in the buffer
        {
            /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
            firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
            lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
            vChrBufSize, vLumBufSize);*/

            //Do horizontal scaling
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
                lumBufIndex++;
                assert(lumBufIndex < 2*vLumBufSize);
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
                RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                                flags, hLumFilter, hLumFilterPos, hLumFilterSize,
                                c->srcFormat, formatConvBuffer,
                                pal, 0);
                if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                    RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                    flags, hLumFilter, hLumFilterPos, hLumFilterSize,
                                    c->srcFormat, formatConvBuffer,
                                    pal, 1);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                assert(chrBufIndex < 2*vChrBufSize);
                assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                            flags, hChrFilter, hChrFilterPos, hChrFilterSize,
                            c->srcFormat, formatConvBuffer,
                            pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }

#if HAVE_MMX
        /* per-line dither tables, alternating with the parity of dstY */
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2)
        {
            /* Ring-buffer cursors: entry [0] is the buffered line firstLumSrcY /
               firstChrSrcY, i.e. the first input line of the vertical filter.
               vLumBufSize / vChrBufSize is added to keep the offset non-negative. */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if HAVE_MMX
            int i;
            /* Fill the per-line tables (source line pointer + coefficient per tap)
               in lumMmxFilter / chrMmxFilter / alpMmxFilter. */
            if (flags & SWS_ACCURATE_RND){
                /* taps are packed in pairs: two line pointers and two 16-bit coefficients */
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2){
                    *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
                        *(void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2){
                    *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            }else{
                /* one entry of four int32 per tap: pointer low/high half, then the
                   16-bit coefficient replicated into both halves of two words */
                for (i=0; i<vLumFilterSize; i++)
                {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++)
                {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                    /* Take the current line through the ring-buffer cursors like
                       every other output path does. Slot 0 of the buffers only
                       holds the current line when the ring has a single slot,
                       which a filter size of 1 does not guarantee. The casts
                       only drop the const added for the cursor arrays. */
                    int16_t *lumBuf = (int16_t *)lumSrcPtr[0];
                    int16_t *chrBuf= (int16_t *)chrSrcPtr[0];
                    int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (int16_t *)alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    c->yuv2yuvX(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }else{
                        /* alpSrcPtr is NULL unless both conditions hold, so guard
                           the dereference with the same expression */
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                            (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? *alpSrcPtr : NULL,
                            dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }else{
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                            (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? *alpSrcPtr : NULL,
                            (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? *(alpSrcPtr+1) : NULL,
                            dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                }
                else //general RGB
                {
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }else{
                        c->yuv2packedX(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            /* last two destination lines: always use the plain C output functions */
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT){
                    yuv2rgbXinC_full(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, dstW, dstY);
                }else{
                    yuv2packedXinC(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* no alpha line buffers: set the alpha plane of the lines written by this
       call (lastDstY .. dstY-1) to 255 */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if HAVE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
    else                             __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
3061

    
3062
/**
 * Install the function pointers of this template instantiation in the context.
 *
 * The vertical output functions, hScale and the fast bilinear horizontal
 * scalers are always set. hcscale_internal (chroma) and hyscale_internal
 * (luma/alpha) are set to the input converter matching c->srcFormat, or left
 * NULL when the source format has no entry below.
 *
 * @param c scaler context; reads srcFormat, chrSrcHSubSample and alpPixBuf
 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;
    /* horizontally subsampled chroma selects the *_half RGB readers */
    const int halfChroma = c->chrSrcHSubSample ? 1 : 0;
    const int haveAlphaBuf = c->alpPixBuf ? 1 : 0;

    /* vertical scalers / output writers */
    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );

    /* horizontal scalers */
    c->hScale       = RENAME(hScale      );
    c->hyscale_fast = RENAME(hyscale_fast);
    c->hcscale_fast = RENAME(hcscale_fast);

    /* chroma input converter */
    c->hcscale_internal = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422:
        c->hcscale_internal = RENAME(yuy2ToUV);
        break;
    case PIX_FMT_UYVY422:
        c->hcscale_internal = RENAME(uyvyToUV);
        break;
    case PIX_FMT_RGB8:
    case PIX_FMT_BGR8:
    case PIX_FMT_PAL8:
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE:
        c->hcscale_internal = RENAME(palToUV);
        break;
    case PIX_FMT_RGB32:
    case PIX_FMT_RGB32_1:
        if (halfChroma) c->hcscale_internal = RENAME(bgr32ToUV_half);
        else            c->hcscale_internal = RENAME(bgr32ToUV);
        break;
    case PIX_FMT_BGR24:
        if (halfChroma) c->hcscale_internal = RENAME(bgr24ToUV_half);
        else            c->hcscale_internal = RENAME(bgr24ToUV);
        break;
    case PIX_FMT_BGR565:
        if (halfChroma) c->hcscale_internal = RENAME(bgr16ToUV_half);
        else            c->hcscale_internal = RENAME(bgr16ToUV);
        break;
    case PIX_FMT_BGR555:
        if (halfChroma) c->hcscale_internal = RENAME(bgr15ToUV_half);
        else            c->hcscale_internal = RENAME(bgr15ToUV);
        break;
    case PIX_FMT_BGR32:
    case PIX_FMT_BGR32_1:
        if (halfChroma) c->hcscale_internal = RENAME(rgb32ToUV_half);
        else            c->hcscale_internal = RENAME(rgb32ToUV);
        break;
    case PIX_FMT_RGB24:
        if (halfChroma) c->hcscale_internal = RENAME(rgb24ToUV_half);
        else            c->hcscale_internal = RENAME(rgb24ToUV);
        break;
    case PIX_FMT_RGB565:
        if (halfChroma) c->hcscale_internal = RENAME(rgb16ToUV_half);
        else            c->hcscale_internal = RENAME(rgb16ToUV);
        break;
    case PIX_FMT_RGB555:
        if (halfChroma) c->hcscale_internal = RENAME(rgb15ToUV_half);
        else            c->hcscale_internal = RENAME(rgb15ToUV);
        break;
    default:
        /* no chroma converter for this source format */
        break;
    }

    /* luma (or alpha) input converter */
    c->hyscale_internal = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422:
    case PIX_FMT_GRAY16BE:
        c->hyscale_internal = RENAME(yuy2ToY);
        break;
    case PIX_FMT_UYVY422:
    case PIX_FMT_GRAY16LE:
        c->hyscale_internal = RENAME(uyvyToY);
        break;
    case PIX_FMT_BGR24:
        c->hyscale_internal = RENAME(bgr24ToY);
        break;
    case PIX_FMT_BGR565:
        c->hyscale_internal = RENAME(bgr16ToY);
        break;
    case PIX_FMT_BGR555:
        c->hyscale_internal = RENAME(bgr15ToY);
        break;
    case PIX_FMT_RGB24:
        c->hyscale_internal = RENAME(rgb24ToY);
        break;
    case PIX_FMT_RGB565:
        c->hyscale_internal = RENAME(rgb16ToY);
        break;
    case PIX_FMT_RGB555:
        c->hyscale_internal = RENAME(rgb15ToY);
        break;
    case PIX_FMT_RGB8:
    case PIX_FMT_BGR8:
    case PIX_FMT_PAL8:
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE:
        c->hyscale_internal = RENAME(palToY);
        break;
    case PIX_FMT_MONOBLACK:
        c->hyscale_internal = RENAME(monoblack2Y);
        break;
    case PIX_FMT_MONOWHITE:
        c->hyscale_internal = RENAME(monowhite2Y);
        break;
    /* 32-bit RGB: with an alpha line buffer the alpha extractor is installed
       instead of the luma converter.
       NOTE(review): swScale runs both its luma and its alpha hyscale pass
       with this same context -- confirm the luma pass is not meant to keep
       using bgr32ToY/rgb32ToY in that case. */
    case PIX_FMT_RGB32:
    case PIX_FMT_RGB32_1:
        if (haveAlphaBuf) c->hyscale_internal = RENAME(abgrToA);
        else              c->hyscale_internal = RENAME(bgr32ToY);
        break;
    case PIX_FMT_BGR32:
    case PIX_FMT_BGR32_1:
        if (haveAlphaBuf) c->hyscale_internal = RENAME(abgrToA);
        else              c->hyscale_internal = RENAME(rgb32ToY);
        break;
    default:
        /* no luma converter for this source format */
        break;
    }
}