Statistics
| Branch: | Revision:

ffmpeg / libswscale / swscale_template.c @ bae76dc3

History | View | Annotate | Download (139 KB)

1
/*
2
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation; either version 2 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU General Public License
17
 * along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 *
20
 * The C code (not assembly, MMX, ...) of this file can be used
21
 * under the LGPL license.
22
 */
23

    
24
/* Drop any earlier definitions of the instruction-selection helper macros so
 * that they can be redefined below (presumably because this template is
 * included more than once with different COMPILE_TEMPLATE_* settings --
 * confirm against the including file). */
#undef REAL_MOVNTQ
25
#undef MOVNTQ
26
#undef PAVGB
27
#undef PREFETCH
28
#undef PREFETCHW
29

    
30
/* PREFETCH / PREFETCHW: mnemonic strings pasted into inline asm.
 *   3DNow! build : "prefetch"    / "prefetchw"
 *   MMX2 build   : "prefetchnta" / "prefetcht0"
 *   otherwise    : " # nop", i.e. text starting with '#', which is expected to
 *                  be treated as an assembler comment so nothing is emitted. */
#if COMPILE_TEMPLATE_AMD3DNOW
31
#define PREFETCH  "prefetch"
32
#define PREFETCHW "prefetchw"
33
#elif COMPILE_TEMPLATE_MMX2
34
#define PREFETCH "prefetchnta"
35
#define PREFETCHW "prefetcht0"
36
#else
37
#define PREFETCH  " # nop"
38
#define PREFETCHW " # nop"
39
#endif
40

    
41
/* PAVGB(a,b): builds the asm text for a packed byte average of a into b,
 * using "pavgb" on MMX2 builds and "pavgusb" on 3DNow! builds.
 * Both arguments are stringified as written.
 * Note: no definition is provided when neither variant is enabled, so any use
 * of PAVGB must itself be guarded by one of these conditions. */
#if COMPILE_TEMPLATE_MMX2
42
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
43
#elif COMPILE_TEMPLATE_AMD3DNOW
44
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
45
#endif
46

    
47
/* MOVNTQ(a,b): builds the asm text for an 8-byte store of a to b.
 * MMX2 builds use "movntq"; all other builds fall back to plain "movq".
 * MOVNTQ forwards to REAL_MOVNTQ so that macro arguments (e.g. a register-name
 * macro inside the operand) are expanded before REAL_MOVNTQ stringifies them. */
#if COMPILE_TEMPLATE_MMX2
48
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
49
#else
50
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
51
#endif
52
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
53

    
54
/* AltiVec builds pull in the PowerPC-specific template at this point. */
#if COMPILE_TEMPLATE_ALTIVEC
55
#include "ppc/swscale_altivec_template.c"
56
#endif
57

    
58
/* YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Complete asm statement: vertical multi-tap filter producing 8 output bytes
 * per outer iteration.
 *
 *   x      - string literal: byte displacement added to every source pointer
 *   offset - string literal: displacement from %0 to the first filter entry
 *   dest   - output pointer (operand %1)
 *   width  - loop bound compared against the running index (operand %2)
 *
 * Operand %0 is &c->redDither, so a variable named c must be in scope at the
 * expansion site.
 *
 * The filter is a list of 16-byte entries: a source pointer at +0 and the
 * coefficient at +8. The inner loop (label 1, first part) multiplies 8 source
 * words by the coefficient with pmulhw and accumulates them in mm3/mm4, which
 * are seeded from VROUNDER_OFFSET(%0). The list ends when the next source
 * pointer is zero. The sums are then shifted right by 3, packed to unsigned
 * bytes with saturation and stored at dest + index; the index advances by 8
 * and the outer loop repeats while index < width (unsigned compare).
 * The body executes at least once because the bound is tested after the store.
 *
 * Clobbers REG_a, REG_d and REG_S.
 * NOTE(review): the asm stores through %1 and uses mm0-mm5, but only the three
 * general-purpose registers are listed as clobbers -- confirm whether a
 * "memory" clobber is required here. */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
59
    __asm__ volatile(\
60
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
61
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
62
        "movq                             %%mm3, %%mm4      \n\t"\
63
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
64
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
65
        ASMALIGN(4) /* FIXME Unroll? */\
66
        "1:                                                 \n\t"\
67
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
68
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
69
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
70
        "add                                $16, %%"REG_d"  \n\t"\
71
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
72
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
73
        "pmulhw                           %%mm0, %%mm2      \n\t"\
74
        "pmulhw                           %%mm0, %%mm5      \n\t"\
75
        "paddw                            %%mm2, %%mm3      \n\t"\
76
        "paddw                            %%mm5, %%mm4      \n\t"\
77
        " jnz                                1b             \n\t"\
78
        "psraw                               $3, %%mm3      \n\t"\
79
        "psraw                               $3, %%mm4      \n\t"\
80
        "packuswb                         %%mm4, %%mm3      \n\t"\
81
        MOVNTQ(%%mm3, (%1, %%REGa))\
82
        "add                                 $8, %%"REG_a"  \n\t"\
83
        "cmp                                 %2, %%"REG_a"  \n\t"\
84
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
85
        "movq                             %%mm3, %%mm4      \n\t"\
86
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
87
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
88
        "jb                                  1b             \n\t"\
89
        :: "r" (&c->redDither),\
90
        "r" (dest), "g" (width)\
91
        : "%"REG_a, "%"REG_d, "%"REG_S\
92
    );
93

    
94
/* YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Complete asm statement with the same parameters and operands as
 * YSCALEYUV2YV12X (%0 = &c->redDither, %1 = dest, %2 = width), but it
 * accumulates in 32 bits and consumes two filter taps per inner iteration.
 *
 * Each filter entry is APCK_SIZE bytes: first source pointer at +0, second
 * source pointer at APCK_PTR2 and a coefficient pair at APCK_COEF. Words from
 * the two source lines are interleaved (punpcklwd/punpckhwd) and multiplied
 * with pmaddwd; the dword sums are kept in mm4..mm7. The list ends when the
 * next first-source pointer is zero.
 *
 * After the inner loop the sums are shifted right by 16, packed to words, the
 * value at VROUNDER_OFFSET(%0) is added, the result is shifted right by 3,
 * packed to unsigned bytes with saturation and stored at dest + index. The
 * index advances by 8 while index < width (unsigned compare, tested after the
 * store).
 *
 * Clobbers REG_a, REG_d and REG_S.
 * NOTE(review): as in YSCALEYUV2YV12X, the store through %1 is not covered by
 * a "memory" clobber -- confirm. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
95
    __asm__ volatile(\
96
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
97
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
98
        "pxor                             %%mm4, %%mm4      \n\t"\
99
        "pxor                             %%mm5, %%mm5      \n\t"\
100
        "pxor                             %%mm6, %%mm6      \n\t"\
101
        "pxor                             %%mm7, %%mm7      \n\t"\
102
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
103
        ASMALIGN(4) \
104
        "1:                                                 \n\t"\
105
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
106
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
107
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
108
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
109
        "movq                             %%mm0, %%mm3      \n\t"\
110
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
111
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
112
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
113
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
114
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
115
        "paddd                            %%mm0, %%mm4      \n\t"\
116
        "paddd                            %%mm3, %%mm5      \n\t"\
117
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
118
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
119
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
120
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
121
        "movq                             %%mm2, %%mm0      \n\t"\
122
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
123
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
124
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
125
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
126
        "paddd                            %%mm2, %%mm6      \n\t"\
127
        "paddd                            %%mm0, %%mm7      \n\t"\
128
        " jnz                                1b             \n\t"\
129
        "psrad                              $16, %%mm4      \n\t"\
130
        "psrad                              $16, %%mm5      \n\t"\
131
        "psrad                              $16, %%mm6      \n\t"\
132
        "psrad                              $16, %%mm7      \n\t"\
133
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
134
        "packssdw                         %%mm5, %%mm4      \n\t"\
135
        "packssdw                         %%mm7, %%mm6      \n\t"\
136
        "paddw                            %%mm0, %%mm4      \n\t"\
137
        "paddw                            %%mm0, %%mm6      \n\t"\
138
        "psraw                               $3, %%mm4      \n\t"\
139
        "psraw                               $3, %%mm6      \n\t"\
140
        "packuswb                         %%mm6, %%mm4      \n\t"\
141
        MOVNTQ(%%mm4, (%1, %%REGa))\
142
        "add                                 $8, %%"REG_a"  \n\t"\
143
        "cmp                                 %2, %%"REG_a"  \n\t"\
144
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
145
        "pxor                             %%mm4, %%mm4      \n\t"\
146
        "pxor                             %%mm5, %%mm5      \n\t"\
147
        "pxor                             %%mm6, %%mm6      \n\t"\
148
        "pxor                             %%mm7, %%mm7      \n\t"\
149
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
150
        "jb                                  1b             \n\t"\
151
        :: "r" (&c->redDither),\
152
        "r" (dest), "g" (width)\
153
        : "%"REG_a, "%"REG_d, "%"REG_S\
154
    );
155

    
156
/* YSCALEYUV2YV121
 *
 * Asm text fragment (not a complete asm statement; the user supplies the
 * surrounding __asm__ and the operand list).
 *
 * Loads the index register REG_a from %2, then per iteration reads 16 words
 * from %0 + 2*index, shifts each right by 7 (arithmetic), packs them to 8
 * unsigned saturated bytes and stores them at %1 + index. The index advances
 * by 8 and the loop continues while that addition does not carry ("jnc"),
 * so %2 is presumably a negative count with %0/%1 pointing at the ends of the
 * buffers -- confirm at the use site. */
#define YSCALEYUV2YV121 \
157
    "mov %2, %%"REG_a"                    \n\t"\
158
    ASMALIGN(4) /* FIXME Unroll? */\
159
    "1:                                   \n\t"\
160
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
161
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
162
    "psraw                 $7, %%mm0      \n\t"\
163
    "psraw                 $7, %%mm1      \n\t"\
164
    "packuswb           %%mm1, %%mm0      \n\t"\
165
    MOVNTQ(%%mm0, (%1, %%REGa))\
166
    "add                   $8, %%"REG_a"  \n\t"\
167
    "jnc                   1b             \n\t"
168

    
169
/* YSCALEYUV2YV121_ACCURATE
 *
 * Rounding variant of YSCALEYUV2YV121 (asm text fragment, same operands).
 * Before the loop mm7 is built as 64 in every word: all-ones from pcmpeqw,
 * logical shift right by 15 gives 1, shift left by 6 gives 64. Inside the
 * loop that constant is added with signed saturation before the >>7, i.e. the
 * division by 128 rounds to nearest instead of truncating. */
#define YSCALEYUV2YV121_ACCURATE \
170
    "mov %2, %%"REG_a"                    \n\t"\
171
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
172
    "psrlw                 $15, %%mm7     \n\t"\
173
    "psllw                  $6, %%mm7     \n\t"\
174
    ASMALIGN(4) /* FIXME Unroll? */\
175
    "1:                                   \n\t"\
176
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
177
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
178
    "paddsw             %%mm7, %%mm0      \n\t"\
179
    "paddsw             %%mm7, %%mm1      \n\t"\
180
    "psraw                 $7, %%mm0      \n\t"\
181
    "psraw                 $7, %%mm1      \n\t"\
182
    "packuswb           %%mm1, %%mm0      \n\t"\
183
    MOVNTQ(%%mm0, (%1, %%REGa))\
184
    "add                   $8, %%"REG_a"  \n\t"\
185
    "jnc                   1b             \n\t"
186

    
187
/*
188
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
189
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
190
       "r" (dest), "m" (dstW),
191
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
192
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
193
*/
194
/* YSCALEYUV2PACKEDX_UV
 *
 * Opens an "__asm__ volatile(" statement and emits the chroma part of the
 * vertical multi-tap filter. The statement is left open: the operand list and
 * closing parenthesis come from YSCALEYUV2PACKEDX_END, and the code that
 * closes outer loop "1:" must be supplied by the user of the macro.
 *
 * Zeroes the index REG_a, then at label 1 walks the 16-byte filter entries at
 * CHR_MMX_FILTER_OFFSET(%0) (source pointer at +0, coefficient at +8) until a
 * zero source pointer is found. U words are read at source + index and V
 * words at source + VOF + index; both are multiplied by the coefficient with
 * pmulhw and accumulated in mm3 (U) and mm4 (V), which are seeded from
 * VROUNDER_OFFSET(%0).
 * Note that the index is used unscaled here, whereas the luma macro scales it
 * by 2. */
#define YSCALEYUV2PACKEDX_UV \
195
    __asm__ volatile(\
196
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
197
        ASMALIGN(4)\
198
        "nop                                            \n\t"\
199
        "1:                                             \n\t"\
200
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
201
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
202
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
203
        "movq                      %%mm3, %%mm4         \n\t"\
204
        ASMALIGN(4)\
205
        "2:                                             \n\t"\
206
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
207
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
208
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
209
        "add                         $16, %%"REG_d"     \n\t"\
210
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
211
        "pmulhw                    %%mm0, %%mm2         \n\t"\
212
        "pmulhw                    %%mm0, %%mm5         \n\t"\
213
        "paddw                     %%mm2, %%mm3         \n\t"\
214
        "paddw                     %%mm5, %%mm4         \n\t"\
215
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
216
        " jnz                         2b                \n\t"\
217

    
218
/* YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 *
 * Asm text fragment: multi-tap vertical filter for one luma-style plane.
 *
 *   offset     - string literal: displacement from %0 to the filter list
 *   coeff      - MMX register receiving each coefficient
 *   src1, src2 - MMX scratch registers for the two groups of 4 source words
 *   dst1, dst2 - MMX registers that hold the two accumulated results
 *
 * dst1/dst2 are seeded from VROUNDER_OFFSET(%0). The filter list uses 16-byte
 * entries (source pointer at +0, coefficient at +8) and ends at a zero source
 * pointer. Source words are read at source + 2*REG_a, multiplied with pmulhw
 * and accumulated with paddw. REG_a is read but not modified; REG_d and REG_S
 * are overwritten. Uses local label 2. */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
219
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
220
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
221
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
222
    "movq                    "#dst1", "#dst2"       \n\t"\
223
    ASMALIGN(4)\
224
    "2:                                             \n\t"\
225
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
226
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
227
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
228
    "add                         $16, %%"REG_d"            \n\t"\
229
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
230
    "pmulhw                 "#coeff", "#src1"       \n\t"\
231
    "pmulhw                 "#coeff", "#src2"       \n\t"\
232
    "paddw                   "#src1", "#dst1"       \n\t"\
233
    "paddw                   "#src2", "#dst2"       \n\t"\
234
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
235
    " jnz                         2b                \n\t"\
236

    
237
/* YSCALEYUV2PACKEDX
 *
 * Chroma pass followed by the luma pass over the list at
 * LUM_MMX_FILTER_OFFSET. After expansion the filtered values are in
 * mm3 (U), mm4 (V), mm1 (first 4 luma words) and mm7 (next 4 luma words);
 * mm0, mm2 and mm5 are used as scratch. The asm statement is still open. */
#define YSCALEYUV2PACKEDX \
238
    YSCALEYUV2PACKEDX_UV \
239
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
240

    
241
/* YSCALEYUV2PACKEDX_END
 *
 * Operand list, clobber list and closing ");" for an asm statement opened by
 * YSCALEYUV2PACKEDX_UV / YSCALEYUV2PACKEDX_ACCURATE_UV.
 *   %0      = &c->redDither (base for all the *_OFFSET displacements)
 *   %1..%3  = dummy memory operands (placeholders that keep the numbering)
 *   %4      = dest
 *   %5      = dstW
 * Requires variables c, dummy, dest and dstW at the expansion site.
 * Clobbers REG_a, REG_d and REG_S. */
#define YSCALEYUV2PACKEDX_END                     \
242
        :: "r" (&c->redDither),                   \
243
            "m" (dummy), "m" (dummy), "m" (dummy),\
244
            "r" (dest), "m" (dstW)                \
245
        : "%"REG_a, "%"REG_d, "%"REG_S            \
246
    );
247

    
248
/* YSCALEYUV2PACKEDX_ACCURATE_UV
 *
 * Higher-precision counterpart of YSCALEYUV2PACKEDX_UV. Opens an
 * "__asm__ volatile(" statement that is left open for the macros that follow.
 *
 * The chroma filter list at CHR_MMX_FILTER_OFFSET(%0) uses APCK_SIZE-byte
 * entries (first source pointer at +0, second at APCK_PTR2, coefficient pair
 * at APCK_COEF) and ends at a zero first-source pointer. Two taps are handled
 * per inner iteration: words from both source lines are interleaved and
 * multiplied with pmaddwd, and the dword sums are kept in mm4/mm5 (U) and
 * mm6/mm7 (V).
 *
 * Afterwards the sums are shifted right by 16, packed to words, the value at
 * VROUNDER_OFFSET(%0) is added, and the results are written to U_TEMP(%0) and
 * V_TEMP(%0) -- i.e. this macro writes into the structure addressed by %0. */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
249
    __asm__ volatile(\
250
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
251
        ASMALIGN(4)\
252
        "nop                                            \n\t"\
253
        "1:                                             \n\t"\
254
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
255
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
256
        "pxor                      %%mm4, %%mm4         \n\t"\
257
        "pxor                      %%mm5, %%mm5         \n\t"\
258
        "pxor                      %%mm6, %%mm6         \n\t"\
259
        "pxor                      %%mm7, %%mm7         \n\t"\
260
        ASMALIGN(4)\
261
        "2:                                             \n\t"\
262
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
263
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
264
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
265
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
266
        "movq                      %%mm0, %%mm3         \n\t"\
267
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
268
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
269
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
270
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
271
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
272
        "paddd                     %%mm0, %%mm4         \n\t"\
273
        "paddd                     %%mm3, %%mm5         \n\t"\
274
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
275
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
276
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
277
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
278
        "movq                      %%mm2, %%mm0         \n\t"\
279
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
280
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
281
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
282
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
283
        "paddd                     %%mm2, %%mm6         \n\t"\
284
        "paddd                     %%mm0, %%mm7         \n\t"\
285
        " jnz                         2b                \n\t"\
286
        "psrad                       $16, %%mm4         \n\t"\
287
        "psrad                       $16, %%mm5         \n\t"\
288
        "psrad                       $16, %%mm6         \n\t"\
289
        "psrad                       $16, %%mm7         \n\t"\
290
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
291
        "packssdw                  %%mm5, %%mm4         \n\t"\
292
        "packssdw                  %%mm7, %%mm6         \n\t"\
293
        "paddw                     %%mm0, %%mm4         \n\t"\
294
        "paddw                     %%mm0, %%mm6         \n\t"\
295
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
296
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
297

    
298
/* YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 *
 * Asm text fragment: higher-precision vertical filter for one luma-style
 * plane, two taps per inner iteration.
 *
 *   offset - string literal: displacement from %0 to the filter list
 *
 * The filter list uses APCK_SIZE-byte entries (first source pointer at +0,
 * second at APCK_PTR2, coefficient pair at APCK_COEF) and ends at a zero
 * first-source pointer. Source words are read at source + 2*REG_a. Dword sums
 * are kept in mm1/mm5 (first 4 samples) and mm7/mm6 (next 4 samples), shifted
 * right by 16, packed to words in mm1 and mm7, and the value at
 * VROUNDER_OFFSET(%0) is added.
 *
 * Finally mm3 and mm4 are reloaded from U_TEMP(%0) and V_TEMP(%0), the slots
 * written by YSCALEYUV2PACKEDX_ACCURATE_UV. Uses local label 2 and overwrites
 * REG_d, REG_S and mm0-mm7. */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
299
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
300
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
301
    "pxor                      %%mm1, %%mm1         \n\t"\
302
    "pxor                      %%mm5, %%mm5         \n\t"\
303
    "pxor                      %%mm7, %%mm7         \n\t"\
304
    "pxor                      %%mm6, %%mm6         \n\t"\
305
    ASMALIGN(4)\
306
    "2:                                             \n\t"\
307
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
308
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
309
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
310
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
311
    "movq                      %%mm0, %%mm3         \n\t"\
312
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
313
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
314
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
315
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
316
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
317
    "paddd                     %%mm0, %%mm1         \n\t"\
318
    "paddd                     %%mm3, %%mm5         \n\t"\
319
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
320
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
321
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
322
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
323
    "movq                      %%mm2, %%mm0         \n\t"\
324
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
325
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
326
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
327
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
328
    "paddd                     %%mm2, %%mm7         \n\t"\
329
    "paddd                     %%mm0, %%mm6         \n\t"\
330
    " jnz                         2b                \n\t"\
331
    "psrad                       $16, %%mm1         \n\t"\
332
    "psrad                       $16, %%mm5         \n\t"\
333
    "psrad                       $16, %%mm7         \n\t"\
334
    "psrad                       $16, %%mm6         \n\t"\
335
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
336
    "packssdw                  %%mm5, %%mm1         \n\t"\
337
    "packssdw                  %%mm6, %%mm7         \n\t"\
338
    "paddw                     %%mm0, %%mm1         \n\t"\
339
    "paddw                     %%mm0, %%mm7         \n\t"\
340
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
341
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\
342

    
343
/* YSCALEYUV2PACKEDX_ACCURATE
 *
 * Higher-precision chroma pass followed by the higher-precision luma pass over
 * the list at LUM_MMX_FILTER_OFFSET. After expansion mm3 = U, mm4 = V,
 * mm1 = first 4 luma words and mm7 = next 4 luma words, the same register
 * layout YSCALEYUV2PACKEDX leaves behind. The asm statement is still open. */
#define YSCALEYUV2PACKEDX_ACCURATE \
344
    YSCALEYUV2PACKEDX_ACCURATE_UV \
345
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
346

    
347
/* YSCALEYUV2RGBX
 *
 * Asm text fragment: YUV -> RGB conversion of the values left in registers by
 * YSCALEYUV2PACKEDX / YSCALEYUV2PACKEDX_ACCURATE.
 *
 * Inputs : mm3 = U, mm4 = V, mm1 = first 4 luma words, mm7 = next 4 luma words
 * Uses   : the offsets and coefficients stored at U_OFFSET, V_OFFSET,
 *          Y_OFFSET, UG_COEFF, VG_COEFF, UB_COEFF, VR_COEFF and Y_COEFF
 *          relative to %0
 * Outputs: mm2 = blue, mm4 = green, mm5 = red, each as 8 unsigned saturated
 *          bytes (per the register notes in the body)
 *
 * Each chroma word is duplicated (punpck?wd of a register with itself) before
 * being added to luma, so one chroma sample is applied to two luma samples.
 * mm0, mm3 and mm6 are overwritten as scratch. */
#define YSCALEYUV2RGBX \
348
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
349
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
350
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
351
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
352
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
353
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
354
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
355
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
356
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
357
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
358
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
359
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
360
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
361
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
362
    "paddw           %%mm3, %%mm4       \n\t"\
363
    "movq            %%mm2, %%mm0       \n\t"\
364
    "movq            %%mm5, %%mm6       \n\t"\
365
    "movq            %%mm4, %%mm3       \n\t"\
366
    "punpcklwd       %%mm2, %%mm2       \n\t"\
367
    "punpcklwd       %%mm5, %%mm5       \n\t"\
368
    "punpcklwd       %%mm4, %%mm4       \n\t"\
369
    "paddw           %%mm1, %%mm2       \n\t"\
370
    "paddw           %%mm1, %%mm5       \n\t"\
371
    "paddw           %%mm1, %%mm4       \n\t"\
372
    "punpckhwd       %%mm0, %%mm0       \n\t"\
373
    "punpckhwd       %%mm6, %%mm6       \n\t"\
374
    "punpckhwd       %%mm3, %%mm3       \n\t"\
375
    "paddw           %%mm7, %%mm0       \n\t"\
376
    "paddw           %%mm7, %%mm6       \n\t"\
377
    "paddw           %%mm7, %%mm3       \n\t"\
378
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
379
    "packuswb        %%mm0, %%mm2       \n\t"\
380
    "packuswb        %%mm6, %%mm5       \n\t"\
381
    "packuswb        %%mm3, %%mm4       \n\t"\
382

    
383
/* REAL_YSCALEYUV2PACKED(index, c)
 *
 * Asm text fragment: blends two source lines for packed output.
 *
 *   index - register used as the loop index (zeroed here)
 *   c     - register holding the base address for the *_OFFSET displacements
 *
 * Operands, per the inline comments: %0 = buf0, %1 = buf1 (luma lines),
 * %2 = uvbuf0, %3 = uvbuf1 (chroma lines).
 *
 * Side effect: before the loop, the 8 bytes stored at
 * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c) are shifted right
 * by 3 and written back, so those stored coefficients are modified every time
 * this fragment runs.
 *
 * Loop head "1:" is opened here and must be closed by the code that follows
 * the expansion. Per iteration it leaves
 *   mm3 = (uvbuf1 >> 7) + (((uvbuf0 - uvbuf1) * chroma coeff) >> 16)   (U)
 *   mm4 = same for the samples at +VOF                                   (V)
 *   mm1 = (buf1 >> 7) + (((buf0 - buf1) * luma coeff) >> 16)   first 4 samples
 *   mm7 = same for the next 4 samples
 * mm0, mm2, mm5 and mm6 are scratch. */
#define REAL_YSCALEYUV2PACKED(index, c) \
384
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
385
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
386
    "psraw                $3, %%mm0                           \n\t"\
387
    "psraw                $3, %%mm1                           \n\t"\
388
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
389
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
390
    "xor            "#index", "#index"                        \n\t"\
391
    ASMALIGN(4)\
392
    "1:                                 \n\t"\
393
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
394
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
395
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
396
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
397
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
398
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
399
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
400
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
401
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
402
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
403
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
404
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
405
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
406
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
407
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
408
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
409
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
410
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
411
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
412
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
413
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
414
    "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
415
    "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
416
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
417
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
418

    
419
/* Forwarding wrapper: lets index and c be macro-expanded before
 * REAL_YSCALEYUV2PACKED stringifies them. */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
420

    
421
/* REAL_YSCALEYUV2RGB_UV(index, c)
 *
 * Asm text fragment: chroma half of the two-line blend for RGB output.
 *
 *   index - register used as the loop index (zeroed here)
 *   c     - register holding the base address for the *_OFFSET displacements
 *
 * Operands, per the inline comments: %2 = uvbuf0, %3 = uvbuf1.
 * Opens loop head "1:", which must be closed by the code that follows.
 *
 * Blends the two chroma lines as
 *   (uvbuf1 >> 4) + (((uvbuf0 - uvbuf1) * coeff at CHR_MMX_FILTER_OFFSET+8) >> 16)
 * then subtracts U_OFFSET / V_OFFSET and starts the colour conversion.
 * On exit: mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg
 * (see the register note at the end of the body). mm0 is scratch. */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
422
    "xor            "#index", "#index"  \n\t"\
423
    ASMALIGN(4)\
424
    "1:                                 \n\t"\
425
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
426
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
427
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
428
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
429
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
430
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
431
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
432
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
433
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
434
    "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
435
    "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+2048] >>4*/\
436
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
437
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
438
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
439
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
440
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
441
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
442
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
443
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
444
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
445

    
446
/* REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 *
 * Asm text fragment: blends 8 samples of two luma-style lines.
 *
 *   index  - register holding the loop index (read only)
 *   c      - register holding the base address for LUM_MMX_FILTER_OFFSET
 *   b1, b2 - registers/operands pointing at the two source lines
 *
 * Result: mm1 (first 4 samples) and mm7 (next 4 samples) hold
 *   (b2 >> 4) + (((b1 - b2) * coeff at LUM_MMX_FILTER_OFFSET+8) >> 16)
 * mm0 and mm6 are scratch. */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
447
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
448
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
449
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
450
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
451
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
452
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
453
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
454
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455
    "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
456
    "psraw                $4, %%mm7     \n\t" /* buf1[eax] >>4*/\
457
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
458
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459

    
460
/* REAL_YSCALEYUV2RGB_COEFF(c)
 *
 * Asm text fragment: final stage of the YUV -> RGB conversion.
 *
 *   c - register holding the base address for the coefficient displacements
 *
 * Inputs : mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg,
 *          mm1 / mm7 = the two groups of 4 luma words
 * Outputs: mm2 = blue, mm4 = green, mm5 = red, each as 8 unsigned saturated
 *          bytes (per the register notes in the body)
 *
 * The instruction sequence matches YSCALEYUV2RGBX from its UB_COEFF multiply
 * onward, with c used as the base register instead of %0.
 * mm0, mm3 and mm6 are overwritten as scratch. */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
461
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
462
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
463
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
464
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
465
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
466
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
467
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
468
    "paddw             %%mm3, %%mm4     \n\t"\
469
    "movq              %%mm2, %%mm0     \n\t"\
470
    "movq              %%mm5, %%mm6     \n\t"\
471
    "movq              %%mm4, %%mm3     \n\t"\
472
    "punpcklwd         %%mm2, %%mm2     \n\t"\
473
    "punpcklwd         %%mm5, %%mm5     \n\t"\
474
    "punpcklwd         %%mm4, %%mm4     \n\t"\
475
    "paddw             %%mm1, %%mm2     \n\t"\
476
    "paddw             %%mm1, %%mm5     \n\t"\
477
    "paddw             %%mm1, %%mm4     \n\t"\
478
    "punpckhwd         %%mm0, %%mm0     \n\t"\
479
    "punpckhwd         %%mm6, %%mm6     \n\t"\
480
    "punpckhwd         %%mm3, %%mm3     \n\t"\
481
    "paddw             %%mm7, %%mm0     \n\t"\
482
    "paddw             %%mm7, %%mm6     \n\t"\
483
    "paddw             %%mm7, %%mm3     \n\t"\
484
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
485
    "packuswb          %%mm0, %%mm2     \n\t"\
486
    "packuswb          %%mm6, %%mm5     \n\t"\
487
    "packuswb          %%mm3, %%mm4     \n\t"\
488

    
489
/* Forwarding wrapper: lets the arguments be macro-expanded before
 * REAL_YSCALEYUV2RGB_YA stringifies them. */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
490

    
491
/* YSCALEYUV2RGB(index, c)
 *
 * Two-line blend plus YUV -> RGB conversion: chroma blend (opens loop "1:"),
 * luma blend of operands %0 and %1, then the coefficient stage. Leaves
 * mm2 = blue, mm4 = green, mm5 = red as packed bytes. The loop opened here
 * must be closed by the code that follows the expansion. */
#define YSCALEYUV2RGB(index, c) \
492
    REAL_YSCALEYUV2RGB_UV(index, c) \
493
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
494
    REAL_YSCALEYUV2RGB_COEFF(c)
495

    
496
/* REAL_YSCALEYUV2PACKED1(index, c)
 *
 * Asm text fragment: single-line variant for packed output (no blending).
 *
 *   index - register used as the loop index (zeroed here)
 *   c     - accepted for symmetry with the other variants; not referenced
 *
 * Opens loop head "1:" and loads, each shifted right by 7 (arithmetic):
 *   mm3 = 4 words from %2 + index          (uvbuf0, U)
 *   mm4 = 4 words from %2 + VOF + index    (uvbuf0, V)
 *   mm1 = 4 words from %0 + 2*index        (buf0)
 *   mm7 = next 4 words from %0 + 2*index + 8
 * The loop must be closed by the code that follows the expansion. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
497
    "xor            "#index", "#index"  \n\t"\
498
    ASMALIGN(4)\
499
    "1:                                 \n\t"\
500
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
501
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
502
    "psraw                $7, %%mm3     \n\t" \
503
    "psraw                $7, %%mm4     \n\t" \
504
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
505
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
506
    "psraw                $7, %%mm1     \n\t" \
507
    "psraw                $7, %%mm7     \n\t" \
508

    
509
/* Forwarding wrapper: lets index and c be macro-expanded before
 * REAL_YSCALEYUV2PACKED1 stringifies them. */
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
510

    
511
/* REAL_YSCALEYUV2RGB1(index, c)
 *
 * Asm text fragment: single-line YUV -> RGB conversion (no blending between
 * lines; only operands %0 = buf0 and %2 = uvbuf0 are read).
 *
 *   index - register used as the loop index (zeroed here)
 *   c     - register holding the base address for the coefficient displacements
 *
 * Opens loop head "1:", loads chroma from %2 (+VOF for V) and luma from %0,
 * shifts each right by 4, then applies the same offset/coefficient arithmetic
 * as REAL_YSCALEYUV2RGB_COEFF. Leaves mm2 = blue, mm4 = green, mm5 = red as 8
 * unsigned saturated bytes each (per the register notes in the body).
 * The loop must be closed by the code that follows the expansion. */
#define REAL_YSCALEYUV2RGB1(index, c) \
512
    "xor            "#index", "#index"  \n\t"\
513
    ASMALIGN(4)\
514
    "1:                                 \n\t"\
515
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
516
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
517
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
518
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] >>4*/\
519
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
520
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
521
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
522
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
523
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
524
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
525
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
526
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
527
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
528
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
529
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] >>4*/\
530
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
531
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
532
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
533
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
534
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
535
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
536
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
537
    "paddw             %%mm3, %%mm4     \n\t"\
538
    "movq              %%mm2, %%mm0     \n\t"\
539
    "movq              %%mm5, %%mm6     \n\t"\
540
    "movq              %%mm4, %%mm3     \n\t"\
541
    "punpcklwd         %%mm2, %%mm2     \n\t"\
542
    "punpcklwd         %%mm5, %%mm5     \n\t"\
543
    "punpcklwd         %%mm4, %%mm4     \n\t"\
544
    "paddw             %%mm1, %%mm2     \n\t"\
545
    "paddw             %%mm1, %%mm5     \n\t"\
546
    "paddw             %%mm1, %%mm4     \n\t"\
547
    "punpckhwd         %%mm0, %%mm0     \n\t"\
548
    "punpckhwd         %%mm6, %%mm6     \n\t"\
549
    "punpckhwd         %%mm3, %%mm3     \n\t"\
550
    "paddw             %%mm7, %%mm0     \n\t"\
551
    "paddw             %%mm7, %%mm6     \n\t"\
552
    "paddw             %%mm7, %%mm3     \n\t"\
553
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
554
    "packuswb          %%mm0, %%mm2     \n\t"\
555
    "packuswb          %%mm6, %%mm5     \n\t"\
556
    "packuswb          %%mm3, %%mm4     \n\t"\
557

    
558
/* Expansion wrapper: forwards to REAL_YSCALEYUV2RGB1 so that the arguments are
 * macro-expanded first, before the REAL_ variant turns them into assembler
 * text with the '#' stringify operator. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
559

    
560
/*
 * Loop head for packed output from a single luma line with chroma taken from
 * two lines (asm operands: %0 = buf0, %2 = uvbuf0, %3 = uvbuf1).
 *
 * Clears `index`, opens local label "1:" and, per iteration, leaves:
 *   mm3 = (uvbuf0[index]       + uvbuf1[index])       >> 8   (4 U words)
 *   mm4 = (uvbuf0[index + VOF] + uvbuf1[index + VOF]) >> 8   (4 V words)
 *   mm1 = buf0[index .. index+3] >> 7,  mm7 = buf0[index+4 .. index+7] >> 7
 * The sum shifted by 8 is the two-line average shifted by 7.
 *
 * `c` is accepted for signature parity with the RGB variants and is not used.
 * The loop is closed by the WRITE* macro that follows ("jb 1b").
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index] */\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index] */\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[index + VOF] */\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[index + VOF] */\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[index] + uvbuf1[index] */\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[index + VOF] + uvbuf1[index + VOF] */\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0[index]   */\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0[index+4] */\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
/* Expansion wrapper so arguments are macro-expanded before stringification. */
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
577

    
578
/*
 * Loop head for RGB output from a single luma line, with vertical chrominance
 * interpolation: chroma is the sum of two lines (asm operands: %0 = buf0,
 * %2 = uvbuf0, %3 = uvbuf1), `c` names the register holding the base address
 * that the *_OFFSET / *_COEFF displacements are applied to.
 *
 * Clears `index`, opens local label "1:" and converts 8 pixels per iteration.
 * On exit from the macro body:
 *   mm2 = 8 packed B bytes, mm4 = 8 packed G bytes, mm5 = 8 packed R bytes.
 * mm0, mm1, mm3, mm6 and mm7 are used as scratch.
 * The loop is closed by the WRITE* macro that follows ("jb 1b").
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index] */\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index] */\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[index + VOF] */\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[index + VOF] */\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[index] + uvbuf1[index] */\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[index + VOF] + uvbuf1[index + VOF] */\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0[index]   */\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0[index+4] */\
    "psraw                $4, %%mm1     \n\t" /* buf0[index]   >>4 */\
    "psraw                $4, %%mm7     \n\t" /* buf0[index+4] >>4 */\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

/* Expansion wrapper so arguments are macro-expanded before stringification. */
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
631

    
632
/*
 * Load 8 alpha samples from a single alpha line (asm operand %1 = abuf0),
 * shift each 16-bit sample right by 7 and pack them with unsigned saturation.
 * Result: mm7 = 8 alpha bytes (samples index..index+7). mm1 is clobbered.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
/* Expansion wrapper so the argument is macro-expanded before stringification. */
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
639

    
640
/*
 * Interleave 8 pixels held as four byte vectors and store them as 32-bit
 * pixels, then close the conversion loop.
 *
 *   dst, dstw, index  destination base, width operand and pixel counter
 *   b, g, r, a        registers holding 8 bytes each of B, G, R and alpha
 *                     (b and r are overwritten)
 *   q0, q2, q3, t     scratch registers
 *
 * The lane comments use register notation, most significant byte first
 * ("ARGB"). 32 bytes are written at dst + index*4 via MOVNTQ.
 * Afterwards index is advanced by 8 and control jumps back to label "1:"
 * while index is below dstw (unsigned compare).
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
/* Expansion wrapper so arguments are macro-expanded before stringification. */
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
663

    
664
/*
 * Pack 8 pixels into 16-bit 5-6-5 words and store them, then close the loop.
 *
 * Inputs: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0 (mm7 is used
 * to zero-extend G to words). mm1 and mm3 are scratch.
 *
 * B and R are masked to their top 5 bits, G to its top 6 bits. Because the
 * masked-off low bits of every byte are zero, the 64-bit psrlq/psllq shifts
 * cannot carry set bits into a neighbouring field. Each resulting word is
 *   (R & 0xF8) << 8 | (G & 0xFC) << 3 | B >> 3.
 *
 * 16 bytes are written at dst + index*2; index is then advanced by 8 and
 * control jumps back to label "1:" while index is below dstw.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
/* Expansion wrapper so arguments are macro-expanded before stringification. */
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
691

    
692
/*
 * Pack 8 pixels into 16-bit 5-5-5 words and store them, then close the loop.
 *
 * Inputs: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0 (mm7 is used
 * to zero-extend G to words). mm1 and mm3 are scratch.
 *
 * All three channels are masked to their top 5 bits first, so the 64-bit
 * shifts that follow cannot carry set bits into a neighbouring field.
 * Each resulting word is
 *   ((R & 0xF8) >> 1) << 8 | (G & 0xF8) << 2 | B >> 3.
 *
 * 16 bytes are written at dst + index*2; index is then advanced by 8 and
 * control jumps back to label "1:" while index is below dstw.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
/* Expansion wrapper so arguments are macro-expanded before stringification. */
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
720

    
721
/*
 * Older plain-MMX 24-bit writer: packs 8 pixels into 24 bytes and closes the
 * conversion loop. Nothing in this part of the file selects it (WRITEBGR24
 * maps to the MMX or MMX2 variant).
 *
 * Inputs: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 * All of mm0..mm6 are overwritten.
 *
 * `dst` must name a register holding the running output pointer: three
 * quadwords are stored at (dst), 8(dst), 16(dst) and dst is advanced by 24.
 * index is then advanced by 8 and control jumps back to label "1:" while
 * index is below dstw. Lane comments use register notation, most significant
 * byte first; the trailing digit is the pixel-pair number.
 */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
776

    
777
/*
 * Plain-MMX 24-bit writer: packs 8 pixels into 24 bytes and closes the
 * conversion loop. Selected as WRITEBGR24 when COMPILE_TEMPLATE_MMX2 is off.
 *
 * Inputs: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes, mm7 = 0.
 * All eight MMX registers are overwritten (mm7 is reused as scratch after
 * the initial zero-extension of R).
 *
 * `dst` must name a register holding the running output pointer: three
 * quadwords are stored at (dst), 8(dst), 16(dst) and dst is advanced by 24.
 * index is then advanced by 8 and control jumps back to label "1:" while
 * index is below dstw. Lane comments use register notation, most significant
 * byte first; the trailing digit is the pixel-pair number.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
829

    
830
/*
 * MMX2 24-bit writer (uses pshufw): packs 8 pixels into 24 bytes and closes
 * the conversion loop. Selected as WRITEBGR24 when COMPILE_TEMPLATE_MMX2 is on.
 *
 * Inputs: mm2 = B bytes, mm4 = G bytes, mm5 = R bytes.
 * mm0 and mm7 are loaded with the ff_M24A / ff_M24C byte masks, ff_M24B is
 * applied from memory; mm1, mm3 and mm6 are scratch and mm4 is shifted in
 * place, so only mm2 and mm5 survive unchanged.
 *
 * `dst` must name a register holding the running output pointer: three
 * quadwords are stored at (dst), 8(dst), 16(dst) and dst is advanced by 24.
 * index is then advanced by 8 and control jumps back to label "1:" while
 * index is below dstw. Lane comments list bytes most significant first.
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* in: mm2=B, %%mm4=G, %%mm5=R; mm0 and mm7 become masks below */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
877

    
878
/* Pick the 24-bit writer for this template instantiation. The #undef comes
 * first because this template is compiled more than once with different
 * COMPILE_TEMPLATE_* settings, so an earlier definition may still be live. */
#undef WRITEBGR24
#if COMPILE_TEMPLATE_MMX2
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
885

    
886
/*
 * Store 8 pixels as packed Y U Y V bytes and close the conversion loop.
 *
 * Inputs (16-bit words): mm1 = luma 0..3, mm7 = luma 4..7,
 *                        mm3 = 4 U samples, mm4 = 4 V samples.
 * Every input is packed to bytes with unsigned saturation, U and V are
 * interleaved, and the result is interleaved with the luma bytes.
 * mm1, mm3, mm4 and mm7 are overwritten.
 *
 * 16 bytes are written at dst + index*2; index is then advanced by 8 and
 * control jumps back to label "1:" while index is below dstw.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t" /* U bytes */\
    "packuswb  %%mm4, %%mm4     \n\t" /* V bytes */\
    "packuswb  %%mm7, %%mm1     \n\t" /* 8 luma bytes */\
    "punpcklbw %%mm4, %%mm3     \n\t" /* interleave U and V */\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t" /* pixels 0..3 */\
    "punpckhbw %%mm3, %%mm7     \n\t" /* pixels 4..7 */\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
/* Expansion wrapper so arguments are macro-expanded before stringification. */
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
902

    
903

    
904
/**
 * Vertically filter several source lines into one line of planar output.
 *
 * @param c             scaler context; c->flags selects the code path
 * @param lumFilter     luma filter coefficients
 * @param lumSrc        luma source lines
 * @param lumFilterSize number of luma filter taps
 * @param chrFilter     chroma filter coefficients
 * @param chrSrc        chroma source lines
 * @param chrFilterSize number of chroma filter taps
 * @param alpSrc        alpha source lines
 * @param dest          luma output line
 * @param uDest         U output line, or NULL to skip both chroma planes
 * @param vDest         V output line
 * @param aDest         alpha output line, or NULL
 * @param dstW          luma/alpha output width
 * @param chrDstW       chroma output width
 *
 * With MMX compiled in and SWS_BITEXACT not set, each plane is produced by
 * the YSCALEYUV2YV12X (or, with SWS_ACCURATE_RND, YSCALEYUV2YV12X_ACCURATE)
 * macro and the function returns. Otherwise the AltiVec or plain C
 * implementation is called.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                /* U is read at offset 0, V at offset VOF, both with the chroma filter. */
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                /* U is read at offset 0, V at offset VOF, both with the chroma filter. */
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    /* NOTE(review): alpSrc/aDest are not passed here, so this path does not
     * appear to write an alpha plane -- confirm that is intended. */
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}
944

    
945
/**
 * Vertically filter several source lines into one line of NV12-style output.
 *
 * There is no SIMD variant here: the call is forwarded unchanged to
 * yuv2nv12XinC(). The context argument c is not used.
 *
 * @param dest      luma output line
 * @param uDest     chroma output line (a single chroma destination is taken)
 * @param dstW      luma output width
 * @param chrDstW   chroma output width
 * @param dstFormat destination pixel format, passed through to the C routine
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
953

    
954
/**
 * Write one line of planar 8-bit output from a single line of 16-bit
 * intermediate samples (no vertical filtering).
 *
 * @param c       scaler context; c->flags selects the code path
 * @param lumSrc  luma samples, dstW entries
 * @param chrSrc  chroma samples: U at [0, chrDstW), V at [VOFW, VOFW + chrDstW)
 * @param alpSrc  alpha samples, dstW entries; only read when aDest is set
 * @param dest    luma output
 * @param uDest   U output, or NULL
 * @param vDest   V output
 * @param aDest   alpha output, or NULL
 * @param dstW    luma/alpha width
 * @param chrDstW chroma width
 *
 * The C path computes (sample + 64) >> 7 and clamps the result to 0..255.
 * With MMX compiled in and SWS_BITEXACT not set, each non-NULL destination
 * plane is instead processed by the YSCALEYUV2YV121 (or, with
 * SWS_ACCURATE_RND, YSCALEYUV2YV121_ACCURATE) asm sequence.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        /* Plane order: alpha, luma, U, V.
         * The source pointers keep their real element type (const int16_t *)
         * rather than being squeezed into uint8_t *, and the end-of-line
         * offset is added only for planes that are actually written, so no
         * pointer arithmetic is performed on an absent (NULL) plane. */
        const int16_t *src[4]= {alpSrc, lumSrc, chrSrc, chrSrc};
        const long srcEnd[4]= {dstW, dstW, chrDstW, VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    /* Both pointers address the end of the line; the third
                     * operand is the negated element count. */
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p] + srcEnd[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p] + srcEnd[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        /* For a 16-bit input val lies in -256..256, so bit 8 is set exactly
         * when val is outside 0..255. */
        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            /* One combined range test covers both components. */
            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
1024

    
1025

    
1026
/**
1027
 * vertical scale YV12 to RGB
1028
 */
1029
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1030
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1031
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1032
{
1033
#if COMPILE_TEMPLATE_MMX
1034
    x86_reg dummy=0;
1035
    if(!(c->flags & SWS_BITEXACT)) {
1036
        if (c->flags & SWS_ACCURATE_RND) {
1037
            switch(c->dstFormat) {
1038
            case PIX_FMT_RGB32:
1039
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1040
                    YSCALEYUV2PACKEDX_ACCURATE
1041
                    YSCALEYUV2RGBX
1042
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1043
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1044
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1045
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1046
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1047
                    "psraw                        $3, %%mm1         \n\t"
1048
                    "psraw                        $3, %%mm7         \n\t"
1049
                    "packuswb                  %%mm7, %%mm1         \n\t"
1050
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1051

    
1052
                    YSCALEYUV2PACKEDX_END
1053
                } else {
1054
                    YSCALEYUV2PACKEDX_ACCURATE
1055
                    YSCALEYUV2RGBX
1056
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1057
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1058

    
1059
                    YSCALEYUV2PACKEDX_END
1060
                }
1061
                return;
1062
            case PIX_FMT_BGR24:
1063
                YSCALEYUV2PACKEDX_ACCURATE
1064
                YSCALEYUV2RGBX
1065
                "pxor %%mm7, %%mm7 \n\t"
1066
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1067
                "add %4, %%"REG_c"                        \n\t"
1068
                WRITEBGR24(%%REGc, %5, %%REGa)
1069

    
1070

    
1071
                :: "r" (&c->redDither),
1072
                "m" (dummy), "m" (dummy), "m" (dummy),
1073
                "r" (dest), "m" (dstW)
1074
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1075
                );
1076
                return;
1077
            case PIX_FMT_RGB555:
1078
                YSCALEYUV2PACKEDX_ACCURATE
1079
                YSCALEYUV2RGBX
1080
                "pxor %%mm7, %%mm7 \n\t"
1081
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1082
#ifdef DITHER1XBPP
1083
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1084
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1085
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1086
#endif
1087

    
1088
                WRITERGB15(%4, %5, %%REGa)
1089
                YSCALEYUV2PACKEDX_END
1090
                return;
1091
            case PIX_FMT_RGB565:
1092
                YSCALEYUV2PACKEDX_ACCURATE
1093
                YSCALEYUV2RGBX
1094
                "pxor %%mm7, %%mm7 \n\t"
1095
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1096
#ifdef DITHER1XBPP
1097
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1098
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1099
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1100
#endif
1101

    
1102
                WRITERGB16(%4, %5, %%REGa)
1103
                YSCALEYUV2PACKEDX_END
1104
                return;
1105
            case PIX_FMT_YUYV422:
1106
                YSCALEYUV2PACKEDX_ACCURATE
1107
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1108

    
1109
                "psraw $3, %%mm3    \n\t"
1110
                "psraw $3, %%mm4    \n\t"
1111
                "psraw $3, %%mm1    \n\t"
1112
                "psraw $3, %%mm7    \n\t"
1113
                WRITEYUY2(%4, %5, %%REGa)
1114
                YSCALEYUV2PACKEDX_END
1115
                return;
1116
            }
1117
        } else {
1118
            switch(c->dstFormat) {
1119
            case PIX_FMT_RGB32:
1120
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1121
                    YSCALEYUV2PACKEDX
1122
                    YSCALEYUV2RGBX
1123
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1124
                    "psraw                        $3, %%mm1         \n\t"
1125
                    "psraw                        $3, %%mm7         \n\t"
1126
                    "packuswb                  %%mm7, %%mm1         \n\t"
1127
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1128
                    YSCALEYUV2PACKEDX_END
1129
                } else {
1130
                    YSCALEYUV2PACKEDX
1131
                    YSCALEYUV2RGBX
1132
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1133
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1134
                    YSCALEYUV2PACKEDX_END
1135
                }
1136
                return;
1137
            case PIX_FMT_BGR24:
1138
                YSCALEYUV2PACKEDX
1139
                YSCALEYUV2RGBX
1140
                "pxor                    %%mm7, %%mm7       \n\t"
1141
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1142
                "add                        %4, %%"REG_c"   \n\t"
1143
                WRITEBGR24(%%REGc, %5, %%REGa)
1144

    
1145
                :: "r" (&c->redDither),
1146
                "m" (dummy), "m" (dummy), "m" (dummy),
1147
                "r" (dest),  "m" (dstW)
1148
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1149
                );
1150
                return;
1151
            case PIX_FMT_RGB555:
1152
                YSCALEYUV2PACKEDX
1153
                YSCALEYUV2RGBX
1154
                "pxor %%mm7, %%mm7 \n\t"
1155
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1156
#ifdef DITHER1XBPP
1157
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1158
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1159
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1160
#endif
1161

    
1162
                WRITERGB15(%4, %5, %%REGa)
1163
                YSCALEYUV2PACKEDX_END
1164
                return;
1165
            case PIX_FMT_RGB565:
1166
                YSCALEYUV2PACKEDX
1167
                YSCALEYUV2RGBX
1168
                "pxor %%mm7, %%mm7 \n\t"
1169
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1170
#ifdef DITHER1XBPP
1171
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1172
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1173
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1174
#endif
1175

    
1176
                WRITERGB16(%4, %5, %%REGa)
1177
                YSCALEYUV2PACKEDX_END
1178
                return;
1179
            case PIX_FMT_YUYV422:
1180
                YSCALEYUV2PACKEDX
1181
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1182

    
1183
                "psraw $3, %%mm3    \n\t"
1184
                "psraw $3, %%mm4    \n\t"
1185
                "psraw $3, %%mm1    \n\t"
1186
                "psraw $3, %%mm7    \n\t"
1187
                WRITEYUY2(%4, %5, %%REGa)
1188
                YSCALEYUV2PACKEDX_END
1189
                return;
1190
            }
1191
        }
1192
    }
1193
#endif /* COMPILE_TEMPLATE_MMX */
1194
#if COMPILE_TEMPLATE_ALTIVEC
1195
    /* The following list of supported dstFormat values should
1196
       match what's found in the body of ff_yuv2packedX_altivec() */
1197
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1198
         (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1199
          c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1200
          c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1201
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1202
                                   chrFilter, chrSrc, chrFilterSize,
1203
                                   dest, dstW, dstY);
1204
    else
1205
#endif
1206
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1207
                       chrFilter, chrSrc, chrFilterSize,
1208
                       alpSrc, dest, dstW, dstY);
1209
}
1210

    
1211
/**
 * vertical bilinear scale YV12 to RGB
 *
 * Takes two luma lines (buf0/buf1), two chroma lines (uvbuf0/uvbuf1) and,
 * for RGB32 with alpha, two alpha lines (abuf0/abuf1), and writes packed
 * output to dest.
 *
 * With COMPILE_TEMPLATE_MMX and without SWS_BITEXACT in c->flags, the
 * destination formats RGB32, BGR24, RGB555, RGB565 and YUYV422 are written by
 * inline assembly and the function returns early. Everything else falls
 * through to the YSCALE_YUV_2_ANYRGB_C macro at the bottom.
 *
 * In every assembly block operand %5 is &c->redDither; it is the base for the
 * context-relative offsets ESP_OFFSET, U_TEMP, V_TEMP, the *_DITHER fields
 * and 8280 (which, per the note in the switch, equals DSTW_OFFSET).
 *
 * NOTE(review): yalpha1, uvalpha1, i, dstW and y are not referenced by the
 * code visible in this function; presumably the YSCALE_* macros (defined
 * elsewhere) pick them up by name -- confirm before renaming or removing.
 */
1214
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1215
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1216
{
1217
    /* complements of the two blend weights */
    int  yalpha1=4095- yalpha;
1218
    int uvalpha1=4095-uvalpha;
1219
    int i;
1220

    
1221
#if COMPILE_TEMPLATE_MMX
1222
    /* the assembly paths are skipped when bit-exact output is requested */
    if(!(c->flags & SWS_BITEXACT)) {
1223
        switch(c->dstFormat) {
1224
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1225
        case PIX_FMT_RGB32:
1226
            /* alpha buffer present: the packed alpha ends up in mm1 and is handed to WRITEBGR32 */
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1227
#if ARCH_X86_64
1228
                /* x86_64: abuf0/abuf1 are passed as register operands %6/%7 and
                 * REG_BP is declared in the clobber list instead of being pushed. */
                __asm__ volatile(
1229
                    YSCALEYUV2RGB(%%REGBP, %5)
1230
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1231
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1232
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1233
                    "packuswb            %%mm7, %%mm1       \n\t"
1234
                    WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1235

    
1236
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1237
                    "a" (&c->redDither)
1238
                    ,"r" (abuf0), "r" (abuf1)
1239
                    : "%"REG_BP
1240
                );
1241
#else
1242
                /* other x86: the alpha pointers are parked in c->u_temp/c->v_temp and
                 * reloaded into %0/%1 inside the asm; the original %0/%1 are saved
                 * with push/pop around that step. */
                *(uint16_t **)(&c->u_temp)=abuf0;
1243
                *(uint16_t **)(&c->v_temp)=abuf1;
1244
                __asm__ volatile(
1245
                    /* save REG_b in the context, then use it to hold dest (%4) */
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1246
                    "mov        %4, %%"REG_b"               \n\t"
1247
                    "push %%"REG_BP"                        \n\t"
1248
                    YSCALEYUV2RGB(%%REGBP, %5)
1249
                    "push                   %0              \n\t"
1250
                    "push                   %1              \n\t"
1251
                    "mov          "U_TEMP"(%5), %0          \n\t"
1252
                    "mov          "V_TEMP"(%5), %1          \n\t"
1253
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1254
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1255
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1256
                    "packuswb            %%mm7, %%mm1       \n\t"
1257
                    "pop                    %1              \n\t"
1258
                    "pop                    %0              \n\t"
1259
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1260
                    "pop %%"REG_BP"                         \n\t"
1261
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1262

    
1263
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1264
                    "a" (&c->redDither)
1265
                );
1266
#endif
1267
            } else {
1268
                /* no alpha: pcmpeqd sets mm7 to all ones and mm7 is passed to
                 * WRITEBGR32 in the position the alpha register occupies above */
                __asm__ volatile(
1269
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1270
                    "mov        %4, %%"REG_b"               \n\t"
1271
                    "push %%"REG_BP"                        \n\t"
1272
                    YSCALEYUV2RGB(%%REGBP, %5)
1273
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1274
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1275
                    "pop %%"REG_BP"                         \n\t"
1276
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1277

    
1278
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1279
                    "a" (&c->redDither)
1280
                );
1281
            }
1282
            return;
1283
        case PIX_FMT_BGR24:
1284
            /* same register save/restore framing as above; mm7 is zeroed before WRITEBGR24 */
            __asm__ volatile(
1285
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1286
                "mov        %4, %%"REG_b"               \n\t"
1287
                "push %%"REG_BP"                        \n\t"
1288
                YSCALEYUV2RGB(%%REGBP, %5)
1289
                "pxor    %%mm7, %%mm7                   \n\t"
1290
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1291
                "pop %%"REG_BP"                         \n\t"
1292
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1293
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1294
                "a" (&c->redDither)
1295
            );
1296
            return;
1297
        case PIX_FMT_RGB555:
1298
            /* 15-bit output; the dither values are added only when DITHER1XBPP is defined */
            __asm__ volatile(
1299
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1300
                "mov        %4, %%"REG_b"               \n\t"
1301
                "push %%"REG_BP"                        \n\t"
1302
                YSCALEYUV2RGB(%%REGBP, %5)
1303
                "pxor    %%mm7, %%mm7                   \n\t"
1304
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1305
#ifdef DITHER1XBPP
1306
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1307
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1308
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1309
#endif
1310

    
1311
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1312
                "pop %%"REG_BP"                         \n\t"
1313
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1314

    
1315
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1316
                "a" (&c->redDither)
1317
            );
1318
            return;
1319
        case PIX_FMT_RGB565:
1320
            /* 16-bit output; identical to the RGB555 case except for the write macro */
            __asm__ volatile(
1321
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1322
                "mov        %4, %%"REG_b"               \n\t"
1323
                "push %%"REG_BP"                        \n\t"
1324
                YSCALEYUV2RGB(%%REGBP, %5)
1325
                "pxor    %%mm7, %%mm7                   \n\t"
1326
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1327
#ifdef DITHER1XBPP
1328
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1329
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1330
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1331
#endif
1332

    
1333
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1334
                "pop %%"REG_BP"                         \n\t"
1335
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1336
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1337
                "a" (&c->redDither)
1338
            );
1339
            return;
1340
        case PIX_FMT_YUYV422:
1341
            /* packed YUV output: uses the PACKED scaling macro instead of the RGB one */
            __asm__ volatile(
1342
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1343
                "mov %4, %%"REG_b"                        \n\t"
1344
                "push %%"REG_BP"                        \n\t"
1345
                YSCALEYUV2PACKED(%%REGBP, %5)
1346
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1347
                "pop %%"REG_BP"                         \n\t"
1348
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1349
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1350
                "a" (&c->redDither)
1351
            );
1352
            return;
1353
        /* any other destination format is left to the C code below */
        default: break;
1354
        }
1355
    }
1356
#endif //COMPILE_TEMPLATE_MMX
1357
    /* generic C path, also reached for SWS_BITEXACT and for formats without an asm case */
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1358
}
1359

    
1360
/**
 * YV12 to RGB without scaling or interpolating
 *
 * Single-line counterpart of yuv2packed2: one luma line (buf0) and one alpha
 * line (abuf0) are supplied, while two chroma lines (uvbuf0/uvbuf1) are still
 * passed. dstFormat and flags are taken as explicit arguments here.
 *
 * Control flow, as visible in this function:
 *  - flags & SWS_FULL_CHR_H_INT: forwarded to c->yuv2packed2 with buf0 and
 *    abuf0 each passed twice and a luma weight of 0.
 *  - COMPILE_TEMPLATE_MMX without SWS_BITEXACT: RGB32, BGR24, RGB555, RGB565
 *    and YUYV422 are written by inline assembly. uvalpha < 2048 selects the
 *    YSCALEYUV2RGB1 / YSCALEYUV2PACKED1 macros, otherwise their "1b"
 *    variants. Neither switch has a default case, so any other format simply
 *    continues to the C code.
 *  - otherwise: the YSCALE_YUV_2_ANYRGB_C macro, again parameterised by
 *    uvalpha < 2048.
 *
 * In every assembly block operand %5 is &c->redDither and operand %4 is dest.
 *
 * NOTE(review): yalpha1, yalpha and i are not referenced by the code visible
 * here, and buf1 only appears as an asm operand; presumably the YSCALE_*
 * macros (defined elsewhere) use them by name -- confirm before touching.
 */
1363
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1364
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1365
{
1366
    const int yalpha1=0;
1367
    int i;
1368

    
1369
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1370
    const int yalpha= 4096; //FIXME ...
1371

    
1372
    /* full horizontal chroma interpolation is delegated to the two-line routine */
    if (flags&SWS_FULL_CHR_H_INT) {
1373
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1374
        return;
1375
    }
1376

    
1377
#if COMPILE_TEMPLATE_MMX
1378
    if(!(flags & SWS_BITEXACT)) {
1379
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1380
            switch(dstFormat) {
1381
            case PIX_FMT_RGB32:
1382
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1383
                    /* alpha variant: the "d" operand carries abuf0 instead of buf1 */
                    __asm__ volatile(
1384
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1385
                        "mov        %4, %%"REG_b"               \n\t"
1386
                        "push %%"REG_BP"                        \n\t"
1387
                        YSCALEYUV2RGB1(%%REGBP, %5)
1388
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1389
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1390
                        "pop %%"REG_BP"                         \n\t"
1391
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1392

    
1393
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1394
                        "a" (&c->redDither)
1395
                    );
1396
                } else {
1397
                    /* no alpha: mm7 is set to all ones before WRITEBGR32 */
                    __asm__ volatile(
1398
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1399
                        "mov        %4, %%"REG_b"               \n\t"
1400
                        "push %%"REG_BP"                        \n\t"
1401
                        YSCALEYUV2RGB1(%%REGBP, %5)
1402
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1403
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1404
                        "pop %%"REG_BP"                         \n\t"
1405
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1406

    
1407
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1408
                        "a" (&c->redDither)
1409
                    );
1410
                }
1411
                return;
1412
            case PIX_FMT_BGR24:
1413
                __asm__ volatile(
1414
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1415
                    "mov        %4, %%"REG_b"               \n\t"
1416
                    "push %%"REG_BP"                        \n\t"
1417
                    YSCALEYUV2RGB1(%%REGBP, %5)
1418
                    "pxor    %%mm7, %%mm7                   \n\t"
1419
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1420
                    "pop %%"REG_BP"                         \n\t"
1421
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1422

    
1423
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1424
                    "a" (&c->redDither)
1425
                );
1426
                return;
1427
            case PIX_FMT_RGB555:
1428
                __asm__ volatile(
1429
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1430
                    "mov        %4, %%"REG_b"               \n\t"
1431
                    "push %%"REG_BP"                        \n\t"
1432
                    YSCALEYUV2RGB1(%%REGBP, %5)
1433
                    "pxor    %%mm7, %%mm7                   \n\t"
1434
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1435
#ifdef DITHER1XBPP
1436
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1437
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1438
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1439
#endif
1440
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1441
                    "pop %%"REG_BP"                         \n\t"
1442
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1443

    
1444
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1445
                    "a" (&c->redDither)
1446
                );
1447
                return;
1448
            case PIX_FMT_RGB565:
1449
                __asm__ volatile(
1450
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1451
                    "mov        %4, %%"REG_b"               \n\t"
1452
                    "push %%"REG_BP"                        \n\t"
1453
                    YSCALEYUV2RGB1(%%REGBP, %5)
1454
                    "pxor    %%mm7, %%mm7                   \n\t"
1455
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1456
#ifdef DITHER1XBPP
1457
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1458
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1459
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1460
#endif
1461

    
1462
                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1463
                    "pop %%"REG_BP"                         \n\t"
1464
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1465

    
1466
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1467
                    "a" (&c->redDither)
1468
                );
1469
                return;
1470
            case PIX_FMT_YUYV422:
1471
                __asm__ volatile(
1472
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1473
                    "mov        %4, %%"REG_b"               \n\t"
1474
                    "push %%"REG_BP"                        \n\t"
1475
                    YSCALEYUV2PACKED1(%%REGBP, %5)
1476
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1477
                    "pop %%"REG_BP"                         \n\t"
1478
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1479

    
1480
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1481
                    "a" (&c->redDither)
1482
                );
1483
                return;
1484
            }
1485
        } else {
1486
            /* uvalpha >= 2048: same cases as above, built on the "1b" scaling macros */
            switch(dstFormat) {
1487
            case PIX_FMT_RGB32:
1488
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1489
                    /* alpha variant: the "d" operand carries abuf0 instead of buf1 */
                    __asm__ volatile(
1490
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1491
                        "mov        %4, %%"REG_b"               \n\t"
1492
                        "push %%"REG_BP"                        \n\t"
1493
                        YSCALEYUV2RGB1b(%%REGBP, %5)
1494
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1495
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1496
                        "pop %%"REG_BP"                         \n\t"
1497
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1498

    
1499
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1500
                        "a" (&c->redDither)
1501
                    );
1502
                } else {
1503
                    __asm__ volatile(
1504
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1505
                        "mov        %4, %%"REG_b"               \n\t"
1506
                        "push %%"REG_BP"                        \n\t"
1507
                        YSCALEYUV2RGB1b(%%REGBP, %5)
1508
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1509
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1510
                        "pop %%"REG_BP"                         \n\t"
1511
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1512

    
1513
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1514
                        "a" (&c->redDither)
1515
                    );
1516
                }
1517
                return;
1518
            case PIX_FMT_BGR24:
1519
                __asm__ volatile(
1520
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1521
                    "mov        %4, %%"REG_b"               \n\t"
1522
                    "push %%"REG_BP"                        \n\t"
1523
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1524
                    "pxor    %%mm7, %%mm7                   \n\t"
1525
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1526
                    "pop %%"REG_BP"                         \n\t"
1527
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1528

    
1529
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1530
                    "a" (&c->redDither)
1531
                );
1532
                return;
1533
            case PIX_FMT_RGB555:
1534
                __asm__ volatile(
1535
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1536
                    "mov        %4, %%"REG_b"               \n\t"
1537
                    "push %%"REG_BP"                        \n\t"
1538
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1539
                    "pxor    %%mm7, %%mm7                   \n\t"
1540
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1541
#ifdef DITHER1XBPP
1542
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1543
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1544
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1545
#endif
1546
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1547
                    "pop %%"REG_BP"                         \n\t"
1548
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1549

    
1550
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1551
                    "a" (&c->redDither)
1552
                );
1553
                return;
1554
            case PIX_FMT_RGB565:
1555
                __asm__ volatile(
1556
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1557
                    "mov        %4, %%"REG_b"               \n\t"
1558
                    "push %%"REG_BP"                        \n\t"
1559
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1560
                    "pxor    %%mm7, %%mm7                   \n\t"
1561
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1562
#ifdef DITHER1XBPP
1563
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1564
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1565
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1566
#endif
1567

    
1568
                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1569
                    "pop %%"REG_BP"                         \n\t"
1570
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1571

    
1572
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1573
                    "a" (&c->redDither)
1574
                );
1575
                return;
1576
            case PIX_FMT_YUYV422:
1577
                __asm__ volatile(
1578
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1579
                    "mov        %4, %%"REG_b"               \n\t"
1580
                    "push %%"REG_BP"                        \n\t"
1581
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
1582
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1583
                    "pop %%"REG_BP"                         \n\t"
1584
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1585

    
1586
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1587
                    "a" (&c->redDither)
1588
                );
1589
                return;
1590
            }
1591
        }
1592
    }
1593
#endif /* COMPILE_TEMPLATE_MMX */
1594
    /* generic C path: reached for SWS_BITEXACT, non-MMX builds and formats without an asm case */
    if (uvalpha < 2048) {
1595
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1596
    } else {
1597
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1598
    }
1599
}
1600

    
1601
//FIXME yuy2* can read up to 7 samples past the end of the input
1602

    
1603
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1604
{
1605
#if COMPILE_TEMPLATE_MMX
1606
    __asm__ volatile(
1607
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1608
        "mov                    %0, %%"REG_a"       \n\t"
1609
        "1:                                         \n\t"
1610
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1611
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1612
        "pand                %%mm2, %%mm0           \n\t"
1613
        "pand                %%mm2, %%mm1           \n\t"
1614
        "packuswb            %%mm1, %%mm0           \n\t"
1615
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
1616
        "add                    $8, %%"REG_a"       \n\t"
1617
        " js                    1b                  \n\t"
1618
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1619
        : "%"REG_a
1620
    );
1621
#else
1622
    int i;
1623
    for (i=0; i<width; i++)
1624
        dst[i]= src[2*i];
1625
#endif
1626
}
1627

    
1628
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1629
{
1630
#if COMPILE_TEMPLATE_MMX
1631
    __asm__ volatile(
1632
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1633
        "mov                    %0, %%"REG_a"       \n\t"
1634
        "1:                                         \n\t"
1635
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1636
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1637
        "psrlw                  $8, %%mm0           \n\t"
1638
        "psrlw                  $8, %%mm1           \n\t"
1639
        "packuswb            %%mm1, %%mm0           \n\t"
1640
        "movq                %%mm0, %%mm1           \n\t"
1641
        "psrlw                  $8, %%mm0           \n\t"
1642
        "pand                %%mm4, %%mm1           \n\t"
1643
        "packuswb            %%mm0, %%mm0           \n\t"
1644
        "packuswb            %%mm1, %%mm1           \n\t"
1645
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
1646
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
1647
        "add                    $4, %%"REG_a"       \n\t"
1648
        " js                    1b                  \n\t"
1649
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1650
        : "%"REG_a
1651
    );
1652
#else
1653
    int i;
1654
    for (i=0; i<width; i++) {
1655
        dstU[i]= src1[4*i + 1];
1656
        dstV[i]= src1[4*i + 3];
1657
    }
1658
#endif
1659
    assert(src1 == src2);
1660
}
1661

    
1662
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1663
{
1664
#if COMPILE_TEMPLATE_MMX
1665
    __asm__ volatile(
1666
        "mov                    %0, %%"REG_a"       \n\t"
1667
        "1:                                         \n\t"
1668
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1669
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1670
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1671
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1672
        "psrlw                  $8, %%mm0           \n\t"
1673
        "psrlw                  $8, %%mm1           \n\t"
1674
        "psrlw                  $8, %%mm2           \n\t"
1675
        "psrlw                  $8, %%mm3           \n\t"
1676
        "packuswb            %%mm1, %%mm0           \n\t"
1677
        "packuswb            %%mm3, %%mm2           \n\t"
1678
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
1679
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
1680
        "add                    $8, %%"REG_a"       \n\t"
1681
        " js                    1b                  \n\t"
1682
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1683
        : "%"REG_a
1684
    );
1685
#else
1686
    int i;
1687
    for (i=0; i<width; i++) {
1688
        dstU[i]= src1[2*i + 1];
1689
        dstV[i]= src2[2*i + 1];
1690
    }
1691
#endif
1692
}
1693

    
1694
/* This is almost identical to the previous, end exists only because
1695
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1696
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1697
{
1698
#if COMPILE_TEMPLATE_MMX
1699
    __asm__ volatile(
1700
        "mov                  %0, %%"REG_a"         \n\t"
1701
        "1:                                         \n\t"
1702
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1703
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1704
        "psrlw                $8, %%mm0             \n\t"
1705
        "psrlw                $8, %%mm1             \n\t"
1706
        "packuswb          %%mm1, %%mm0             \n\t"
1707
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1708
        "add                  $8, %%"REG_a"         \n\t"
1709
        " js                  1b                    \n\t"
1710
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1711
        : "%"REG_a
1712
    );
1713
#else
1714
    int i;
1715
    for (i=0; i<width; i++)
1716
        dst[i]= src[2*i+1];
1717
#endif
1718
}
1719

    
1720
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1721
{
1722
#if COMPILE_TEMPLATE_MMX
1723
    __asm__ volatile(
1724
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1725
        "mov                    %0, %%"REG_a"       \n\t"
1726
        "1:                                         \n\t"
1727
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1728
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1729
        "pand                %%mm4, %%mm0           \n\t"
1730
        "pand                %%mm4, %%mm1           \n\t"
1731
        "packuswb            %%mm1, %%mm0           \n\t"
1732
        "movq                %%mm0, %%mm1           \n\t"
1733
        "psrlw                  $8, %%mm0           \n\t"
1734
        "pand                %%mm4, %%mm1           \n\t"
1735
        "packuswb            %%mm0, %%mm0           \n\t"
1736
        "packuswb            %%mm1, %%mm1           \n\t"
1737
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
1738
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
1739
        "add                    $4, %%"REG_a"       \n\t"
1740
        " js                    1b                  \n\t"
1741
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1742
        : "%"REG_a
1743
    );
1744
#else
1745
    int i;
1746
    for (i=0; i<width; i++) {
1747
        dstU[i]= src1[4*i + 0];
1748
        dstV[i]= src1[4*i + 2];
1749
    }
1750
#endif
1751
    assert(src1 == src2);
1752
}
1753

    
1754
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1755
{
1756
#if COMPILE_TEMPLATE_MMX
1757
    __asm__ volatile(
1758
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1759
        "mov                    %0, %%"REG_a"       \n\t"
1760
        "1:                                         \n\t"
1761
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1762
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1763
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1764
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1765
        "pand                %%mm4, %%mm0           \n\t"
1766
        "pand                %%mm4, %%mm1           \n\t"
1767
        "pand                %%mm4, %%mm2           \n\t"
1768
        "pand                %%mm4, %%mm3           \n\t"
1769
        "packuswb            %%mm1, %%mm0           \n\t"
1770
        "packuswb            %%mm3, %%mm2           \n\t"
1771
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
1772
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
1773
        "add                    $8, %%"REG_a"       \n\t"
1774
        " js                    1b                  \n\t"
1775
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1776
        : "%"REG_a
1777
    );
1778
#else
1779
    int i;
1780
    for (i=0; i<width; i++) {
1781
        dstU[i]= src1[2*i];
1782
        dstV[i]= src2[2*i];
1783
    }
1784
#endif
1785
}
1786

    
1787
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1788
                                    const uint8_t *src, long width)
1789
{
1790
#if COMPILE_TEMPLATE_MMX
1791
    __asm__ volatile(
1792
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1793
        "mov                    %0, %%"REG_a"       \n\t"
1794
        "1:                                         \n\t"
1795
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1796
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1797
        "movq                %%mm0, %%mm2           \n\t"
1798
        "movq                %%mm1, %%mm3           \n\t"
1799
        "pand                %%mm4, %%mm0           \n\t"
1800
        "pand                %%mm4, %%mm1           \n\t"
1801
        "psrlw                  $8, %%mm2           \n\t"
1802
        "psrlw                  $8, %%mm3           \n\t"
1803
        "packuswb            %%mm1, %%mm0           \n\t"
1804
        "packuswb            %%mm3, %%mm2           \n\t"
1805
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
1806
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
1807
        "add                    $8, %%"REG_a"       \n\t"
1808
        " js                    1b                  \n\t"
1809
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1810
        : "%"REG_a
1811
    );
1812
#else
1813
    int i;
1814
    for (i = 0; i < width; i++) {
1815
        dst1[i] = src[2*i+0];
1816
        dst2[i] = src[2*i+1];
1817
    }
1818
#endif
1819
}
1820

    
1821
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1822
                                    const uint8_t *src1, const uint8_t *src2,
1823
                                    long width, uint32_t *unused)
1824
{
1825
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1826
}
1827

    
1828
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1829
                                    const uint8_t *src1, const uint8_t *src2,
1830
                                    long width, uint32_t *unused)
1831
{
1832
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1833
}
1834

    
1835
#if COMPILE_TEMPLATE_MMX
1836
/**
 * MMX conversion of a line of packed 3-byte pixels to one output byte per
 * pixel (luma, going by the coefficient table names).
 *
 * The coefficient registers mm5/mm6 are loaded by a separate asm statement
 * selected by srcFormat: the ff_bgr24toY* tables for PIX_FMT_BGR24, the
 * ff_rgb24toY* tables for any other value. The main loop uses mm5/mm6
 * without reloading them, so it depends on nothing clobbering the MMX
 * registers between the asm statements.
 *
 * Each loop iteration consumes 12 source bytes and stores 4 output bytes; the
 * loop counter is tested after the body, so the body runs at least once.
 *
 * @param dst       output line; addressed as (dst+width) plus a negative index
 * @param src       packed input line, advanced by the asm ("+r" operand)
 * @param width     number of pixels to convert
 * @param srcFormat selects the coefficient tables as described above
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1837
{
1838

    
1839
    if(srcFormat == PIX_FMT_BGR24) {
1840
        __asm__ volatile(
1841
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
1842
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
1843
            :
1844
        );
1845
    } else {
1846
        __asm__ volatile(
1847
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
1848
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
1849
            :
1850
        );
1851
    }
1852

    
1853
    __asm__ volatile(
1854
        /* mm4 = offset added to every sum, REG_a = -width, mm7 = 0 for unpacking */
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
1855
        "mov                        %2, %%"REG_a"   \n\t"
1856
        "pxor                    %%mm7, %%mm7       \n\t"
1857
        "1:                                         \n\t"
1858
        PREFETCH"               64(%0)              \n\t"
1859
        /* four overlapping 4-byte loads covering 12 source bytes */
        "movd                     (%0), %%mm0       \n\t"
1860
        "movd                    2(%0), %%mm1       \n\t"
1861
        "movd                    6(%0), %%mm2       \n\t"
1862
        "movd                    8(%0), %%mm3       \n\t"
1863
        "add                       $12, %0          \n\t"
1864
        /* widen bytes to words */
        "punpcklbw               %%mm7, %%mm0       \n\t"
1865
        "punpcklbw               %%mm7, %%mm1       \n\t"
1866
        "punpcklbw               %%mm7, %%mm2       \n\t"
1867
        "punpcklbw               %%mm7, %%mm3       \n\t"
1868
        /* multiply-accumulate against the two coefficient vectors */
        "pmaddwd                 %%mm5, %%mm0       \n\t"
1869
        "pmaddwd                 %%mm6, %%mm1       \n\t"
1870
        "pmaddwd                 %%mm5, %%mm2       \n\t"
1871
        "pmaddwd                 %%mm6, %%mm3       \n\t"
1872
        "paddd                   %%mm1, %%mm0       \n\t"
1873
        "paddd                   %%mm3, %%mm2       \n\t"
1874
        /* add the offset, scale down by 15 bits and pack to bytes */
        "paddd                   %%mm4, %%mm0       \n\t"
1875
        "paddd                   %%mm4, %%mm2       \n\t"
1876
        "psrad                     $15, %%mm0       \n\t"
1877
        "psrad                     $15, %%mm2       \n\t"
1878
        "packssdw                %%mm2, %%mm0       \n\t"
1879
        "packuswb                %%mm0, %%mm0       \n\t"
1880
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
1881
        "add                        $4, %%"REG_a"   \n\t"
1882
        " js                        1b              \n\t"
1883
    : "+r" (src)
1884
    : "r" (dst+width), "g" ((x86_reg)-width)
1885
    : "%"REG_a
1886
    );
1887
}
1888

    
1889
/**
 * MMX conversion of one line of packed 24-bit pixels to U and V samples.
 *
 * @param dstU      destination for the U samples
 * @param dstV      destination for the V samples
 * @param src       packed 3-bytes-per-pixel input; advanced by the asm (12 bytes per iteration)
 * @param width     number of pixels to convert
 * @param srcFormat selects the coefficient row: ff_bgr24toUV[1] when it equals
 *                  PIX_FMT_RGB24, ff_bgr24toUV[0] otherwise
 *
 * Each loop iteration consumes 12 source bytes and stores 4 bytes to each of
 * dstU and dstV. REG_a starts at -width and is incremented by 4 until it is no
 * longer negative.
 * NOTE(review): because the counter steps by 4, a width that is not a multiple
 * of 4 appears to make the last iteration read and write past width -- confirm
 * that callers provide padded buffers.
 */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1890
{
1891
    __asm__ volatile(
1892
        // mm6 = fourth 8-byte coefficient group of the selected table row
        "movq                    24+%4, %%mm6       \n\t"
1893
        // REG_a = -width (negative loop counter), mm7 = 0 for byte->word unpacking
        "mov                        %3, %%"REG_a"   \n\t"
1894
        "pxor                    %%mm7, %%mm7       \n\t"
1895
        "1:                                         \n\t"
1896
        PREFETCH"               64(%0)              \n\t"
1897
        // first half of the iteration: source bytes 0..5
        "movd                     (%0), %%mm0       \n\t"
1898
        "movd                    2(%0), %%mm1       \n\t"
1899
        "punpcklbw               %%mm7, %%mm0       \n\t"
1900
        "punpcklbw               %%mm7, %%mm1       \n\t"
1901
        "movq                    %%mm0, %%mm2       \n\t"
1902
        "movq                    %%mm1, %%mm3       \n\t"
1903
        // mm0 accumulates with coefficient groups 0 and 1, mm2 with groups 2 and 3
        "pmaddwd                    %4, %%mm0       \n\t"
1904
        "pmaddwd                  8+%4, %%mm1       \n\t"
1905
        "pmaddwd                 16+%4, %%mm2       \n\t"
1906
        "pmaddwd                 %%mm6, %%mm3       \n\t"
1907
        "paddd                   %%mm1, %%mm0       \n\t"
1908
        "paddd                   %%mm3, %%mm2       \n\t"
1909

    
1910
        // second half of the iteration: source bytes 6..11, same coefficient layout
        "movd                    6(%0), %%mm1       \n\t"
1911
        "movd                    8(%0), %%mm3       \n\t"
1912
        "add                       $12, %0          \n\t"
1913
        "punpcklbw               %%mm7, %%mm1       \n\t"
1914
        "punpcklbw               %%mm7, %%mm3       \n\t"
1915
        "movq                    %%mm1, %%mm4       \n\t"
1916
        "movq                    %%mm3, %%mm5       \n\t"
1917
        "pmaddwd                    %4, %%mm1       \n\t"
1918
        "pmaddwd                  8+%4, %%mm3       \n\t"
1919
        "pmaddwd                 16+%4, %%mm4       \n\t"
1920
        "pmaddwd                 %%mm6, %%mm5       \n\t"
1921
        "paddd                   %%mm3, %%mm1       \n\t"
1922
        "paddd                   %%mm5, %%mm4       \n\t"
1923

    
1924
        // add the shared offset constant, drop 15 fractional bits, pack down to bytes
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
1925
        "paddd                   %%mm3, %%mm0       \n\t"
1926
        "paddd                   %%mm3, %%mm2       \n\t"
1927
        "paddd                   %%mm3, %%mm1       \n\t"
1928
        "paddd                   %%mm3, %%mm4       \n\t"
1929
        "psrad                     $15, %%mm0       \n\t"
1930
        "psrad                     $15, %%mm2       \n\t"
1931
        "psrad                     $15, %%mm1       \n\t"
1932
        "psrad                     $15, %%mm4       \n\t"
1933
        "packssdw                %%mm1, %%mm0       \n\t"
1934
        "packssdw                %%mm4, %%mm2       \n\t"
1935
        "packuswb                %%mm0, %%mm0       \n\t"
1936
        "packuswb                %%mm2, %%mm2       \n\t"
1937
        // %1 = dstU+width and %2 = dstV+width, so the negative counter indexes from the line start
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
1938
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
1939
        "add                        $4, %%"REG_a"   \n\t"
1940
        " js                        1b              \n\t"
1941
    : "+r" (src)
1942
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
1943
    : "%"REG_a
1944
    );
1945
}
1946
#endif
1947

    
1948
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1949
{
1950
#if COMPILE_TEMPLATE_MMX
1951
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1952
#else
1953
    int i;
1954
    for (i=0; i<width; i++) {
1955
        int b= src[i*3+0];
1956
        int g= src[i*3+1];
1957
        int r= src[i*3+2];
1958

    
1959
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1960
    }
1961
#endif /* COMPILE_TEMPLATE_MMX */
1962
}
1963

    
1964
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1965
{
1966
#if COMPILE_TEMPLATE_MMX
1967
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1968
#else
1969
    int i;
1970
    for (i=0; i<width; i++) {
1971
        int b= src1[3*i + 0];
1972
        int g= src1[3*i + 1];
1973
        int r= src1[3*i + 2];
1974

    
1975
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1976
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1977
    }
1978
#endif /* COMPILE_TEMPLATE_MMX */
1979
    assert(src1 == src2);
1980
}
1981

    
1982
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1983
{
1984
    int i;
1985
    for (i=0; i<width; i++) {
1986
        int b= src1[6*i + 0] + src1[6*i + 3];
1987
        int g= src1[6*i + 1] + src1[6*i + 4];
1988
        int r= src1[6*i + 2] + src1[6*i + 5];
1989

    
1990
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1991
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1992
    }
1993
    assert(src1 == src2);
1994
}
1995

    
1996
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1997
{
1998
#if COMPILE_TEMPLATE_MMX
1999
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
2000
#else
2001
    int i;
2002
    for (i=0; i<width; i++) {
2003
        int r= src[i*3+0];
2004
        int g= src[i*3+1];
2005
        int b= src[i*3+2];
2006

    
2007
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2008
    }
2009
#endif
2010
}
2011

    
2012
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2013
{
2014
#if COMPILE_TEMPLATE_MMX
2015
    assert(src1==src2);
2016
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2017
#else
2018
    int i;
2019
    assert(src1==src2);
2020
    for (i=0; i<width; i++) {
2021
        int r= src1[3*i + 0];
2022
        int g= src1[3*i + 1];
2023
        int b= src1[3*i + 2];
2024

    
2025
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2026
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2027
    }
2028
#endif
2029
}
2030

    
2031
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2032
{
2033
    int i;
2034
    assert(src1==src2);
2035
    for (i=0; i<width; i++) {
2036
        int r= src1[6*i + 0] + src1[6*i + 3];
2037
        int g= src1[6*i + 1] + src1[6*i + 4];
2038
        int b= src1[6*i + 2] + src1[6*i + 5];
2039

    
2040
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2041
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2042
    }
2043
}
2044

    
2045

    
2046
// bilinear / bicubic scaling
2047
/**
 * Horizontally filter one line of 8-bit samples into 16-bit intermediates.
 *
 * The C fallback computes, for every output sample i,
 *     val = sum over j < filterSize of src[filterPos[i] + j] * filter[filterSize*i + j]
 * and stores FFMIN(val>>7, 32767).
 *
 * @param dst        output line, dstW samples
 * @param dstW       number of output samples
 * @param src        input line of 8-bit samples
 * @param srcW       only forwarded to the AltiVec implementation here
 * @param xInc       only forwarded to the AltiVec implementation here
 * @param filter     coefficients, filterSize per output sample
 * @param filterPos  first source index for each output sample
 * @param filterSize taps per output sample; the MMX build asserts it is a
 *                   positive multiple of 4
 *
 * The MMX build has dedicated loops for filterSize 4 and 8 and a generic loop
 * for other sizes. All three produce two output samples per iteration and
 * narrow with packssdw instead of FFMIN.
 * NOTE(review): since two samples are stored per iteration, an odd dstW looks
 * like it writes one sample past the end -- confirm buffer padding.
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2048
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
2049
{
2050
#if COMPILE_TEMPLATE_MMX
2051
    assert(filterSize % 4 == 0 && filterSize>0);
2052
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
2053
        // counter is a negative byte offset into dst/filterPos that counts up to zero;
        // the three pointers are moved to their line ends so (ptr + counter) starts at the base.
        x86_reg counter= -2*dstW;
2054
        filter-= counter*2;
2055
        filterPos-= counter/2;
2056
        dst-= counter/2;
2057
        __asm__ volatile(
2058
#if defined(PIC)
            // REG_b is saved/restored by hand under PIC instead of being declared as a clobber
2059
            "push            %%"REG_b"              \n\t"
2060
#endif
2061
            "pxor                %%mm7, %%mm7       \n\t"
2062
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
2063
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
2064
            ASMALIGN(4)
2065
            "1:                                     \n\t"
            // eax/ebx = filterPos for the two output samples of this iteration
2066
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
2067
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
            // mm1/mm3 = the two 4-tap coefficient rows, mm0/mm2 = 4 source bytes each
2068
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
2069
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
2070
            "movd      (%3, %%"REG_a"), %%mm0       \n\t"
2071
            "movd      (%3, %%"REG_b"), %%mm2       \n\t"
2072
            "punpcklbw           %%mm7, %%mm0       \n\t"
2073
            "punpcklbw           %%mm7, %%mm2       \n\t"
2074
            "pmaddwd             %%mm1, %%mm0       \n\t"
2075
            "pmaddwd             %%mm2, %%mm3       \n\t"
            // horizontal add of the partial sums, then >>7 and pack to two int16 results
2076
            "movq                %%mm0, %%mm4       \n\t"
2077
            "punpckldq           %%mm3, %%mm0       \n\t"
2078
            "punpckhdq           %%mm3, %%mm4       \n\t"
2079
            "paddd               %%mm4, %%mm0       \n\t"
2080
            "psrad                  $7, %%mm0       \n\t"
2081
            "packssdw            %%mm0, %%mm0       \n\t"
2082
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
2083
            "add                    $4, %%"REG_BP"  \n\t"
2084
            " jnc                   1b              \n\t"
2085

    
2086
            "pop            %%"REG_BP"              \n\t"
2087
#if defined(PIC)
2088
            "pop             %%"REG_b"              \n\t"
2089
#endif
2090
            : "+a" (counter)
2091
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2092
#if !defined(PIC)
2093
            : "%"REG_b
2094
#endif
2095
        );
2096
    } else if (filterSize==8) {
2097
        // same scheme as the 4-tap loop, with 8 coefficients (16 bytes) per output sample
        x86_reg counter= -2*dstW;
2098
        filter-= counter*4;
2099
        filterPos-= counter/2;
2100
        dst-= counter/2;
2101
        __asm__ volatile(
2102
#if defined(PIC)
2103
            "push             %%"REG_b"             \n\t"
2104
#endif
2105
            "pxor                 %%mm7, %%mm7      \n\t"
2106
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
2107
            "mov              %%"REG_a", %%"REG_BP" \n\t"
2108
            ASMALIGN(4)
2109
            "1:                                     \n\t"
2110
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
2111
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
            // taps 0..3 of both output samples
2112
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
2113
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2114
            "movd       (%3, %%"REG_a"), %%mm0      \n\t"
2115
            "movd       (%3, %%"REG_b"), %%mm2      \n\t"
2116
            "punpcklbw            %%mm7, %%mm0      \n\t"
2117
            "punpcklbw            %%mm7, %%mm2      \n\t"
2118
            "pmaddwd              %%mm1, %%mm0      \n\t"
2119
            "pmaddwd              %%mm2, %%mm3      \n\t"
2120

    
            // taps 4..7 of both output samples, accumulated onto the first half
2121
            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
2122
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2123
            "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
2124
            "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
2125
            "punpcklbw            %%mm7, %%mm4      \n\t"
2126
            "punpcklbw            %%mm7, %%mm2      \n\t"
2127
            "pmaddwd              %%mm1, %%mm4      \n\t"
2128
            "pmaddwd              %%mm2, %%mm5      \n\t"
2129
            "paddd                %%mm4, %%mm0      \n\t"
2130
            "paddd                %%mm5, %%mm3      \n\t"
2131
            "movq                 %%mm0, %%mm4      \n\t"
2132
            "punpckldq            %%mm3, %%mm0      \n\t"
2133
            "punpckhdq            %%mm3, %%mm4      \n\t"
2134
            "paddd                %%mm4, %%mm0      \n\t"
2135
            "psrad                   $7, %%mm0      \n\t"
2136
            "packssdw             %%mm0, %%mm0      \n\t"
2137
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2138
            "add                     $4, %%"REG_BP" \n\t"
2139
            " jnc                    1b             \n\t"
2140

    
2141
            "pop             %%"REG_BP"             \n\t"
2142
#if defined(PIC)
2143
            "pop              %%"REG_b"             \n\t"
2144
#endif
2145
            : "+a" (counter)
2146
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2147
#if !defined(PIC)
2148
            : "%"REG_b
2149
#endif
2150
        );
2151
    } else {
2152
        // Generic tap count: the inner loop "2:" walks the source 4 bytes at a time
        // from src up to offset (= src+filterSize) while the filter pointer advances.
        uint8_t *offset = src+filterSize;
2153
        x86_reg counter= -2*dstW;
2154
        //filter-= counter*filterSize/2;
2155
        filterPos-= counter/2;
2156
        dst-= counter/2;
2157
        __asm__ volatile(
2158
            "pxor                  %%mm7, %%mm7     \n\t"
2159
            ASMALIGN(4)
2160
            "1:                                     \n\t"
2161
            "mov                      %2, %%"REG_c" \n\t"
2162
            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2163
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2164
            "mov                      %5, %%"REG_c" \n\t"
            // mm4/mm5 accumulate the two output samples
2165
            "pxor                  %%mm4, %%mm4     \n\t"
2166
            "pxor                  %%mm5, %%mm5     \n\t"
2167
            "2:                                     \n\t"
            // (%1) is the first sample's coefficient row, (%1, %6) the second one's
            // (%6 = filterSize*2 bytes = one row)
2168
            "movq                   (%1), %%mm1     \n\t"
2169
            "movq               (%1, %6), %%mm3     \n\t"
2170
            "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2171
            "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2172
            "punpcklbw             %%mm7, %%mm0     \n\t"
2173
            "punpcklbw             %%mm7, %%mm2     \n\t"
2174
            "pmaddwd               %%mm1, %%mm0     \n\t"
2175
            "pmaddwd               %%mm2, %%mm3     \n\t"
2176
            "paddd                 %%mm3, %%mm5     \n\t"
2177
            "paddd                 %%mm0, %%mm4     \n\t"
2178
            "add                      $8, %1        \n\t"
2179
            "add                      $4, %%"REG_c" \n\t"
2180
            "cmp                      %4, %%"REG_c" \n\t"
2181
            " jb                      2b            \n\t"
            // skip over the second sample's row, which was consumed via the (%1, %6) loads
2182
            "add                      %6, %1        \n\t"
2183
            "movq                  %%mm4, %%mm0     \n\t"
2184
            "punpckldq             %%mm5, %%mm4     \n\t"
2185
            "punpckhdq             %%mm5, %%mm0     \n\t"
2186
            "paddd                 %%mm0, %%mm4     \n\t"
2187
            "psrad                    $7, %%mm4     \n\t"
2188
            "packssdw              %%mm4, %%mm4     \n\t"
2189
            "mov                      %3, %%"REG_a" \n\t"
2190
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2191
            "add                      $4, %0        \n\t"
2192
            " jnc                     1b            \n\t"
2193

    
2194
            : "+r" (counter), "+r" (filter)
2195
            : "m" (filterPos), "m" (dst), "m"(offset),
2196
            "m" (src), "r" ((x86_reg)filterSize*2)
2197
            : "%"REG_a, "%"REG_c, "%"REG_d
2198
        );
2199
    }
2200
#else
2201
#if COMPILE_TEMPLATE_ALTIVEC
2202
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2203
#else
2204
    // Plain C reference implementation
    int i;
2205
    for (i=0; i<dstW; i++) {
2206
        int j;
2207
        int srcPos= filterPos[i];
2208
        int val=0;
2209
        //printf("filterPos: %d\n", filterPos[i]);
2210
        for (j=0; j<filterSize; j++) {
2211
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2212
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2213
        }
2214
        //filter += hFilterSize;
2215
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2216
        //dst[i] = val>>7;
2217
    }
2218
#endif /* COMPILE_ALTIVEC */
2219
#endif /* COMPILE_MMX */
2220
}
2221

    
2222
//FIXME all pal and rgb srcFormats could do this conversion as well
2223
//FIXME all scalers more complex than bilinear could do half of this transform
2224
static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2225
{
2226
    int i;
2227
    for (i = 0; i < width; i++) {
2228
        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2229
        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2230
    }
2231
}
2232
static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2233
{
2234
    int i;
2235
    for (i = 0; i < width; i++) {
2236
        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
2237
        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2238
    }
2239
}
2240
static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2241
{
2242
    int i;
2243
    for (i = 0; i < width; i++)
2244
        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2245
}
2246
static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2247
{
2248
    int i;
2249
    for (i = 0; i < width; i++)
2250
        dst[i] = (dst[i]*14071 + 33561947)>>14;
2251
}
2252

    
2253
/*
 * Inline-asm fragment shared by the plain x86 fast-bilinear loops.
 * The call sites load src[xx] into edi and src[xx+1] into esi and keep xalpha
 * in ecx. The fragment leaves
 *     esi = ((src[xx] << 16) + (src[xx+1] - src[xx]) * xalpha) >> 9
 * and, as a side effect, reloads asm operand %1 into REG_D (overwriting edi
 * after its value has been consumed).
 */
#define FAST_BILINEAR_X86 \
2254
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
2255
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
2256
    "shll      $16, %%edi    \n\t"                                              \
2257
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
2258
    "mov        %1, %%"REG_D"\n\t"                                              \
2259
    "shrl       $9, %%esi    \n\t"                                              \
2260

    
2261
/**
 * Portable fast-bilinear horizontal scaler for one luma line.
 *
 * The source position is tracked in 16.16 fixed point and advanced by xInc
 * per output sample. Each output is the left neighbour scaled by 128 plus the
 * neighbour difference weighted by the top 7 bits of the fractional position.
 * Note that src[index+1] is always read.
 *
 * @param c        not read
 * @param dst      output line, dstWidth samples
 * @param dstWidth number of output samples
 * @param src      input line of 8-bit samples
 * @param srcW     not read
 * @param xInc     16.16 fixed-point source step per output sample
 */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        int dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
    unsigned int pos = 0;
    int x;

    for (x = 0; x < dstWidth; x++, pos += xInc) {
        const unsigned int idx  = pos >> 16;
        const unsigned int frac = (pos & 0xFFFF) >> 9;

        dst[x] = (src[idx]<<7) + (src[idx+1] - src[idx])*frac;
    }
}
2274

    
2275
      // *** horizontal scale Y line to temp buffer
2276
/**
 * Horizontally scale one luma (or alpha) source line into the 16-bit
 * intermediate buffer dst.
 *
 * Steps, in order:
 *  1. src is advanced by c->alpSrcOffset (isAlpha) or c->lumSrcOffset.
 *  2. If the context provides an input converter (c->hascale_internal for
 *     alpha, c->hyscale_internal otherwise) the line is first converted into
 *     formatConvBuffer, which then becomes the source.
 *  3. The line is scaled either by c->hScale() with the hLumFilter tables or,
 *     when SWS_FAST_BILINEAR is set, by a fast bilinear path: MMX2 runtime
 *     generated code, plain x86 asm, or c->hyscale_fast(), depending on the
 *     build configuration and canMMX2BeUsed.
 *  4. For non-alpha lines c->lumConvertRange, if set, is applied to dst.
 *
 * srcFormat is not referenced in this function body.
 */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2277
                                   int flags, const int16_t *hLumFilter,
2278
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
2279
                                   enum PixelFormat srcFormat, uint8_t *formatConvBuffer,
2280
                                   uint32_t *pal, int isAlpha)
2281
{
2282
    int32_t av_unused *mmx2FilterPos = c->lumMmx2FilterPos;
2283
    int16_t av_unused *mmx2Filter    = c->lumMmx2Filter;
2284
    int     av_unused canMMX2BeUsed  = c->canMMX2BeUsed;
2285
    void    av_unused *mmx2FilterCode= c->lumMmx2FilterCode;
2286
    void (*internal_func)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->hascale_internal : c->hyscale_internal;
2287
    // range conversion is never applied to alpha lines
    void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2288

    
2289
    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2290

    
2291
    if (internal_func) {
2292
        internal_func(formatConvBuffer, src, srcW, pal);
2293
        src= formatConvBuffer;
2294
    }
2295

    
2296
#if COMPILE_TEMPLATE_MMX
2297
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2298
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2299
#else
2300
    if (!(flags&SWS_FAST_BILINEAR))
2301
#endif
2302
    {
2303
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2304
    } else { // fast bilinear upscale / crap downscale
2305
#if ARCH_X86 && CONFIG_GPL
2306
#if COMPILE_TEMPLATE_MMX2
2307
        int i;
2308
#if defined(PIC)
2309
        // scratch slot used to save and restore REG_b around the asm below
        DECLARE_ALIGNED(8, uint64_t, ebxsave);
2310
#endif
2311
        if (canMMX2BeUsed) {
2312
            __asm__ volatile(
2313
#if defined(PIC)
2314
                "mov               %%"REG_b", %5        \n\t"
2315
#endif
2316
                "pxor                  %%mm7, %%mm7     \n\t"
                // REG_c = src, REG_D = dst, REG_d = mmx2Filter, REG_b = mmx2FilterPos
2317
                "mov                      %0, %%"REG_c" \n\t"
2318
                "mov                      %1, %%"REG_D" \n\t"
2319
                "mov                      %2, %%"REG_d" \n\t"
2320
                "mov                      %3, %%"REG_b" \n\t"
2321
                "xor               %%"REG_a", %%"REG_a" \n\t" // i
2322
                PREFETCH"        (%%"REG_c")            \n\t"
2323
                PREFETCH"      32(%%"REG_c")            \n\t"
2324
                PREFETCH"      64(%%"REG_c")            \n\t"
2325

    
                // CALL_MMX2_FILTER_CODE: call the code at mmx2FilterCode (operand %4), then
                // advance REG_c by the value read from (REG_b, REG_a), advance REG_D by REG_a
                // and reset REG_a. The macro is also used by hcscale().
                // NOTE(review): what the called code itself does is not visible here.
2326
#if ARCH_X86_64
2327

    
2328
#define CALL_MMX2_FILTER_CODE \
2329
                "movl            (%%"REG_b"), %%esi     \n\t"\
2330
                "call                    *%4            \n\t"\
2331
                "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
2332
                "add               %%"REG_S", %%"REG_c" \n\t"\
2333
                "add               %%"REG_a", %%"REG_D" \n\t"\
2334
                "xor               %%"REG_a", %%"REG_a" \n\t"\
2335

    
2336
#else
2337

    
2338
#define CALL_MMX2_FILTER_CODE \
2339
                "movl (%%"REG_b"), %%esi        \n\t"\
2340
                "call         *%4                       \n\t"\
2341
                "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2342
                "add               %%"REG_a", %%"REG_D" \n\t"\
2343
                "xor               %%"REG_a", %%"REG_a" \n\t"\
2344

    
2345
#endif /* ARCH_X86_64 */
2346

    
2347
                CALL_MMX2_FILTER_CODE
2348
                CALL_MMX2_FILTER_CODE
2349
                CALL_MMX2_FILTER_CODE
2350
                CALL_MMX2_FILTER_CODE
2351
                CALL_MMX2_FILTER_CODE
2352
                CALL_MMX2_FILTER_CODE
2353
                CALL_MMX2_FILTER_CODE
2354
                CALL_MMX2_FILTER_CODE
2355

    
2356
#if defined(PIC)
2357
                "mov                      %5, %%"REG_b" \n\t"
2358
#endif
2359
                :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2360
                "m" (mmx2FilterCode)
2361
#if defined(PIC)
2362
                ,"m" (ebxsave)
2363
#endif
2364
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2365
#if !defined(PIC)
2366
                ,"%"REG_b
2367
#endif
2368
            );
2369
            // Right-edge fix-up: outputs whose source position maps to the last source
            // sample or beyond are overwritten with that last sample scaled by 128.
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
2370
        } else {
2371
#endif /* COMPILE_TEMPLATE_MMX2 */
2372
        // split the 16.16 step into its integer part and 16-bit fraction for the add/adc pair below
        x86_reg xInc_shr16 = xInc >> 16;
2373
        uint16_t xInc_mask = xInc & 0xffff;
2374
        //NO MMX just normal asm ...
2375
        __asm__ volatile(
2376
            "xor %%"REG_a", %%"REG_a"            \n\t" // i
2377
            "xor %%"REG_d", %%"REG_d"            \n\t" // xx
2378
            "xorl    %%ecx, %%ecx                \n\t" // xalpha
2379
            ASMALIGN(4)
2380
            "1:                                  \n\t"
            // the loop body is unrolled twice: two output samples per iteration
2381
            "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
2382
            "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
2383
            FAST_BILINEAR_X86
2384
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2385
            "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
2386
            "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry
2387

    
2388
            "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
2389
            "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
2390
            FAST_BILINEAR_X86
2391
            "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
2392
            "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
2393
            "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry
2394

    
2395

    
2396
            "add        $2, %%"REG_a"            \n\t"
2397
            "cmp        %2, %%"REG_a"            \n\t"
2398
            " jb        1b                       \n\t"
2399

    
2400

    
2401
            :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
2402
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2403
        );
2404
#if COMPILE_TEMPLATE_MMX2
2405
        } //if MMX2 can't be used
2406
#endif
2407
#else
2408
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2409
#endif /* ARCH_X86 */
2410
    }
2411

    
2412
    if (convertRange)
2413
        convertRange(dst, dstWidth);
2414
}
2415

    
2416
/**
 * Portable fast-bilinear horizontal scaler for one pair of chroma lines.
 *
 * The source position is tracked in 16.16 fixed point and advanced by xInc
 * per output sample. With frac being the top 7 bits of the fractional
 * position, each output is left*(frac^127) + right*frac. The first plane is
 * written to dst[0..dstWidth), the second to dst[VOFW..VOFW+dstWidth).
 * Note that the sample at index+1 is always read from both sources.
 *
 * An algebraically similar form, (left<<7) + (right - left)*frac, was noted
 * by the original author as slower.
 *
 * @param c        not read
 * @param dst      output buffer holding both chroma planes
 * @param dstWidth number of output samples per plane
 * @param src1     first chroma input line
 * @param src2     second chroma input line
 * @param srcW     not read
 * @param xInc     16.16 fixed-point source step per output sample
 */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        int dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
    unsigned int pos = 0;
    int x;

    for (x = 0; x < dstWidth; x++, pos += xInc) {
        const unsigned int idx  = pos >> 16;
        const unsigned int frac = (pos & 0xFFFF) >> 9;

        dst[x]        = (src1[idx]*(frac^127) + src1[idx+1]*frac);
        dst[x + VOFW] = (src2[idx]*(frac^127) + src2[idx+1]*frac);
    }
}
2434

    
2435
/**
 * Horizontally scale one pair of chroma source lines into the 16-bit
 * intermediate buffer: the first plane goes to dst, the second to dst+VOFW.
 *
 * Steps, in order:
 *  1. Returns immediately, leaving dst untouched, for gray and
 *     PIX_FMT_MONOBLACK / PIX_FMT_MONOWHITE sources.
 *  2. src1 and src2 are advanced by c->chrSrcOffset.
 *  3. If c->hcscale_internal is set, the input is first converted into
 *     formatConvBuffer (second plane at formatConvBuffer+VOFW), which then
 *     becomes the source.
 *  4. The lines are scaled either by two c->hScale() calls with the hChrFilter
 *     tables or, when SWS_FAST_BILINEAR is set, by a fast bilinear path: MMX2
 *     runtime generated code, plain x86 asm, or c->hcscale_fast(), depending
 *     on the build configuration and canMMX2BeUsed.
 *  5. c->chrConvertRange, if set, is applied to dst.
 */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
2436
                                   int srcW, int xInc, int flags, const int16_t *hChrFilter,
2437
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
2438
                                   enum PixelFormat srcFormat, uint8_t *formatConvBuffer,
2439
                                   uint32_t *pal)
2440
{
2441
    int32_t av_unused *mmx2FilterPos = c->chrMmx2FilterPos;
2442
    int16_t av_unused *mmx2Filter    = c->chrMmx2Filter;
2443
    int     av_unused canMMX2BeUsed  = c->canMMX2BeUsed;
2444
    void    av_unused *mmx2FilterCode= c->chrMmx2FilterCode;
2445

    
2446
    // no chroma scaling is performed for these source formats
    if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
2447
        return;
2448

    
2449
    src1 += c->chrSrcOffset;
2450
    src2 += c->chrSrcOffset;
2451

    
2452
    if (c->hcscale_internal) {
2453
        c->hcscale_internal(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
2454
        src1= formatConvBuffer;
2455
        src2= formatConvBuffer+VOFW;
2456
    }
2457

    
2458
#if COMPILE_TEMPLATE_MMX
2459
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2460
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2461
#else
2462
    if (!(flags&SWS_FAST_BILINEAR))
2463
#endif
2464
    {
2465
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2466
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2467
    } else { // fast bilinear upscale / crap downscale
2468
#if ARCH_X86 && CONFIG_GPL
2469
#if COMPILE_TEMPLATE_MMX2
2470
        int i;
2471
#if defined(PIC)
2472
        // scratch slot used to save and restore REG_b around the asm below
        DECLARE_ALIGNED(8, uint64_t, ebxsave);
2473
#endif
2474
        if (canMMX2BeUsed) {
2475
            __asm__ volatile(
2476
#if defined(PIC)
2477
                "mov          %%"REG_b", %6         \n\t"
2478
#endif
2479
                "pxor             %%mm7, %%mm7      \n\t"
                // first plane: REG_c = src1, REG_D = dst, REG_d = mmx2Filter, REG_b = mmx2FilterPos
2480
                "mov                 %0, %%"REG_c"  \n\t"
2481
                "mov                 %1, %%"REG_D"  \n\t"
2482
                "mov                 %2, %%"REG_d"  \n\t"
2483
                "mov                 %3, %%"REG_b"  \n\t"
2484
                "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2485
                PREFETCH"   (%%"REG_c")             \n\t"
2486
                PREFETCH" 32(%%"REG_c")             \n\t"
2487
                PREFETCH" 64(%%"REG_c")             \n\t"
2488

    
2489
                CALL_MMX2_FILTER_CODE
2490
                CALL_MMX2_FILTER_CODE
2491
                CALL_MMX2_FILTER_CODE
2492
                CALL_MMX2_FILTER_CODE
                // second plane: REG_c = src2, REG_D = dst plus VOF bytes
2493
                "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2494
                "mov                 %5, %%"REG_c"  \n\t" // src
2495
                "mov                 %1, %%"REG_D"  \n\t" // buf1
2496
                "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
2497
                PREFETCH"   (%%"REG_c")             \n\t"
2498
                PREFETCH" 32(%%"REG_c")             \n\t"
2499
                PREFETCH" 64(%%"REG_c")             \n\t"
2500

    
2501
                CALL_MMX2_FILTER_CODE
2502
                CALL_MMX2_FILTER_CODE
2503
                CALL_MMX2_FILTER_CODE
2504
                CALL_MMX2_FILTER_CODE
2505

    
2506
#if defined(PIC)
2507
                "mov %6, %%"REG_b"    \n\t"
2508
#endif
2509
                :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2510
                "m" (mmx2FilterCode), "m" (src2)
2511
#if defined(PIC)
2512
                ,"m" (ebxsave)
2513
#endif
2514
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2515
#if !defined(PIC)
2516
                ,"%"REG_b
2517
#endif
2518
            );
2519
            // Right-edge fix-up: outputs whose source position maps to the last source
            // sample or beyond are overwritten with that last sample scaled by 128.
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2520
                //printf("%d %d %d\n", dstWidth, i, srcW);
2521
                dst[i] = src1[srcW-1]*128;
2522
                dst[i+VOFW] = src2[srcW-1]*128;
2523
            }
2524
        } else {
2525
#endif /* COMPILE_TEMPLATE_MMX2 */
2526
            // split the 16.16 step into its integer part and 16-bit fraction for the add/adc pair below
            x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
2527
            uint16_t xInc_mask = xInc & 0xffff;
2528
            __asm__ volatile(
2529
                "xor %%"REG_a", %%"REG_a"               \n\t" // i
2530
                "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2531
                "xorl    %%ecx, %%ecx                   \n\t" // xalpha
2532
                ASMALIGN(4)
2533
                "1:                                     \n\t"
                // one output sample per plane per iteration; src1 is reloaded from memory
                // each time because esi is overwritten by the interpolation
2534
                "mov        %0, %%"REG_S"               \n\t"
2535
                "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2536
                "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2537
                FAST_BILINEAR_X86
2538
                "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2539

    
2540
                "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2541
                "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2542
                FAST_BILINEAR_X86
2543
                "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
2544

    
2545
                "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
2546
                "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
2547
                "add        $1, %%"REG_a"               \n\t"
2548
                "cmp        %2, %%"REG_a"               \n\t"
2549
                " jb        1b                          \n\t"
2550

    
2551
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2552
   which is needed to support GCC 4.0. */
2553
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
2554
                :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2555
#else
2556
                :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2557
#endif
2558
                "r" (src2)
2559
                : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2560
            );
2561
#if COMPILE_TEMPLATE_MMX2
2562
        } //if MMX2 can't be used
2563
#endif
2564
#else
2565
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
2566
#endif /* ARCH_X86 */
2567
    }
2568

    
2569
    if (c->chrConvertRange)
2570
        c->chrConvertRange(dst, dstWidth);
2571
}
2572

    
2573
/* Compile-time switch: set to a non-zero value to enable the DEBUG_BUFFERS() trace output. */
#define DEBUG_SWSCALE_BUFFERS 0
2574
/* Forwards its arguments to av_log() at AV_LOG_DEBUG when DEBUG_SWSCALE_BUFFERS is non-zero.
 * Expects a variable named c to be in scope at the call site; it is passed as the log context. */
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2575

    
2576
/**
 * Scale one slice of the source picture.
 *
 * Source lines are first scaled horizontally into the luma/chroma (and
 * optionally alpha) line ring buffers, then every destination line whose
 * required input lines are available is produced by the vertical
 * scaler / output converter. Progress (dstY, ring buffer indices, last
 * buffered source lines) is loaded from and stored back into the context so
 * that a picture can be fed slice by slice.
 *
 * @param c          scaler context
 * @param src        source plane pointers for this slice; for packed source
 *                   formats all four entries are overwritten with src[0]
 * @param srcStride  source strides; modified in place (packed formats copy
 *                   entry 0 to all entries, entries 1 and 2 are shifted left
 *                   by c->vChrDrop)
 * @param srcSliceY  first source line of the slice; 0 resets the state
 * @param srcSliceH  number of source lines in the slice
 * @param dst        destination plane pointers
 * @param dstStride  destination strides
 * @return number of destination lines written by this call
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const enum PixelFormat srcFormat= c->srcFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    /* negate-shift-negate: rounds the chroma slice height up instead of down
       (assuming an arithmetic right shift of negative values) */
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* packed input: every "plane" refers to the same buffer and stride */
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
        if (!enough_lines) {
            /* not enough input yet: still buffer everything this slice offers */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
        }

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            flags, hLumFilter, hLumFilterPos, hLumFilterSize,
                            c->srcFormat, formatConvBuffer,
                            pal, 0);
            /* the alpha plane shares the luma geometry and filter */
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                flags, hLumFilter, hLumFilterPos, hLumFilterSize,
                                c->srcFormat, formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
        }
        while(lastInChrBuf < lastChrSrcY) {
            uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (!(isGray(srcFormat) || isGray(dstFormat)))
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                flags, hChrFilter, hChrFilterPos, hChrFilterSize,
                                c->srcFormat, formatConvBuffer,
                                pal);
            lastInChrBuf++;
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if COMPILE_TEMPLATE_MMX
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2) {
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            /* publish line pointers and vertical coefficients to the
               per-context MMX filter tables */
            int i;
            if (flags & SWS_ACCURATE_RND) {
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                for (i=0; i<vLumFilterSize; i++) {
                    /* pointer split into low / high 32-bit halves */
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    /* 16-bit coefficient replicated into both halves of the word */
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    int16_t *lumBuf = lumSrcPtr[0];
                    int16_t *chrBuf= chrSrcPtr[0];
                    int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        /* alpSrcPtr is NULL unless CONFIG_SWSCALE_ALPHA && alpPixBuf,
                           so guard the dereference with that same condition */
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        /* same guard as above: never dereference a NULL alpSrcPtr */
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? *alpSrcPtr : NULL,
                                       (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            /* last two destination lines: always use the plain C output functions */
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else {
                    yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* YUVA420P output without an alpha buffer: fill the alpha lines written in this call with 255 */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
    else                             __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
2937

    
2938
/**
 * Install the function pointers of this template instantiation into the
 * context and pick the per-source-format callbacks.
 *
 * Sets the vertical output functions (yuv2nv12X, yuv2yuv1, yuv2yuvX,
 * yuv2packed1/2/X), hScale and the fast horizontal scalers, then selects
 * hcscale_internal / hyscale_internal / hascale_internal from c->srcFormat
 * (left NULL when the format has no entry). Source offsets and the range
 * conversion callbacks are only assigned for the cases listed below; for all
 * other cases those fields are left untouched.
 *
 * @param c context to initialize; srcFormat, dstFormat, chrSrcHSubSample,
 *          alpPixBuf, srcRange and dstRange are read from it
 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );

    c->hScale       = RENAME(hScale      );

    c->hyscale_fast = RENAME(hyscale_fast);
    c->hcscale_fast = RENAME(hcscale_fast);

    /* chroma callback: formats independent of horizontal chroma subsampling */
    c->hcscale_internal = NULL;
    switch(srcFormat) {
        case PIX_FMT_YUYV422  : c->hcscale_internal = RENAME(yuy2ToUV); break;
        case PIX_FMT_UYVY422  : c->hcscale_internal = RENAME(uyvyToUV); break;
        case PIX_FMT_NV12     : c->hcscale_internal = RENAME(nv12ToUV); break;
        case PIX_FMT_NV21     : c->hcscale_internal = RENAME(nv21ToUV); break;
        case PIX_FMT_RGB8     :
        case PIX_FMT_BGR8     :
        case PIX_FMT_PAL8     :
        case PIX_FMT_BGR4_BYTE:
        case PIX_FMT_RGB4_BYTE: c->hcscale_internal = palToUV; break;
        case PIX_FMT_YUV420P16BE:
        case PIX_FMT_YUV422P16BE:
        case PIX_FMT_YUV444P16BE: c->hcscale_internal = RENAME(BEToUV); break;
        case PIX_FMT_YUV420P16LE:
        case PIX_FMT_YUV422P16LE:
        case PIX_FMT_YUV444P16LE: c->hcscale_internal = RENAME(LEToUV); break;
        default: break; /* keep NULL */
    }
    /* chroma callback: the *_half variants are used when chrSrcHSubSample is set */
    if (c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->hcscale_internal = rgb48ToUV_half; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->hcscale_internal = bgr32ToUV_half; break;
        case PIX_FMT_BGR24  : c->hcscale_internal = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->hcscale_internal = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->hcscale_internal = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->hcscale_internal = rgb32ToUV_half; break;
        case PIX_FMT_RGB24  : c->hcscale_internal = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->hcscale_internal = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->hcscale_internal = rgb15ToUV_half; break;
        default: break; /* keep the selection made above */
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->hcscale_internal = rgb48ToUV; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->hcscale_internal = bgr32ToUV; break;
        case PIX_FMT_BGR24  : c->hcscale_internal = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->hcscale_internal = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->hcscale_internal = bgr15ToUV; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->hcscale_internal = rgb32ToUV; break;
        case PIX_FMT_RGB24  : c->hcscale_internal = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->hcscale_internal = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->hcscale_internal = rgb15ToUV; break;
        default: break; /* keep the selection made above */
        }
    }

    /* luma and alpha callbacks */
    c->hyscale_internal = NULL;
    c->hascale_internal = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422  :
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY16BE : c->hyscale_internal = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE : c->hyscale_internal = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->hyscale_internal = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->hyscale_internal = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->hyscale_internal = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->hyscale_internal = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->hyscale_internal = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->hyscale_internal = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->hyscale_internal = palToY; break;
    case PIX_FMT_MONOBLACK: c->hyscale_internal = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->hyscale_internal = monowhite2Y; break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_RGB32_1: c->hyscale_internal = bgr32ToY; break;
    case PIX_FMT_BGR32  :
    case PIX_FMT_BGR32_1: c->hyscale_internal = rgb32ToY; break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE: c->hyscale_internal = rgb48ToY; break;
    default: break; /* keep NULL */
    }
    /* an alpha callback is only installed when an alpha line buffer exists */
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->hascale_internal = abgrToA; break;
        default: break; /* keep NULL */
        }
    }

    /* per-format source offsets; other formats leave the fields as they are */
    switch (srcFormat) {
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB32_1:
    case PIX_FMT_BGR32_1:
        c->lumSrcOffset = ALT32_CORR;
        c->chrSrcOffset = ALT32_CORR;
        break;
    case PIX_FMT_RGB48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    default: break; /* offsets left untouched */
    }

    /* range conversion is only set up when source and destination ranges
       differ and the destination is neither RGB nor BGR */
    if (c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }
}