/* Source: ffmpeg / libswscale / swscale_template.c @ 116758a3 */

/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * Instruction-selection helpers.
 *
 * The four names are undefined first so that the definitions below always
 * start from a clean slate, then redefined according to the
 * COMPILE_TEMPLATE_* switches.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH

/* PREFETCH: prefetch mnemonic for the selected CPU flavour, or an assembler
 * comment (" # nop") when neither 3DNow! nor MMX2 is selected. */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH  " # nop"
#endif

/* PAVGB(a,b): one asm line using "pavgb" (MMX2) or "pavgusb" (3DNow!).
 * It is left undefined when neither switch is set. */
#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* REAL_MOVNTQ(a,b): 64-bit store, "movntq" with MMX2 and plain "movq"
 * otherwise. */
#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
/* The extra level lets macro arguments (e.g. REGa) be expanded before
 * REAL_MOVNTQ stringifies them with '#'. */
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

/* Pull in the AltiVec template only when that flavour is being compiled. */
#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif

/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Complete asm statement (ends with ");").
 *
 * x      - string literal displacement pasted in front of the source
 *          addressing expression.
 * offset - string literal displacement from operand %0 to the filter list.
 * dest   - output pointer, bound to %1.
 * width  - loop limit, cast to x86_reg and bound to %2; REG_a is compared
 *          against it after each 8-byte store.
 *
 * The enclosing scope must provide `c` (operand %0 is &c->redDither).
 *
 * For every 8 output bytes the code walks a list of 16-byte entries
 * (source pointer at +0, coefficients at +8) until it reads a zero source
 * pointer. It accumulates pmulhw(coefficient, source words) on top of the
 * rounder at VROUNDER_OFFSET(%0), shifts right by 3, packs with unsigned
 * saturation and stores through MOVNTQ.
 *
 * Both the inner "jnz" and the outer "jb" jump to label 1; the outer path
 * reloads the accumulators and list pointer first.
 *
 * Clobbers REG_a, REG_d, REG_S.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t" /* FIXME Unroll? */\
        "1:                                                 \n\t"\
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
        "add                                $16, %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t" /* flags are consumed by the jnz below; the MMX ops in between leave them alone */\
        "pmulhw                           %%mm0, %%mm2      \n\t"\
        "pmulhw                           %%mm0, %%mm5      \n\t"\
        "paddw                            %%mm2, %%mm3      \n\t"\
        "paddw                            %%mm5, %%mm4      \n\t"\
        " jnz                                1b             \n\t"\
        "psraw                               $3, %%mm3      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "packuswb                         %%mm4, %%mm3      \n\t"\
        MOVNTQ(%%mm3, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t" /* flags are consumed by the jb below; movq/lea/mov do not modify them */\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" ((x86_reg)width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Same interface as YSCALEYUV2YV12X (complete asm statement, operands
 * %0 = &c->redDither, %1 = dest, %2 = (x86_reg)width; clobbers REG_a,
 * REG_d, REG_S), but accumulates in 32 bits.
 *
 * Each filter-list entry is APCK_SIZE bytes and supplies two source
 * pointers (at +0 and at APCK_PTR2) plus a coefficient quadword at
 * APCK_COEF. Words from the two sources are interleaved with
 * punpcklwd/punpckhwd and multiplied-and-added with pmaddwd into the dword
 * accumulators mm4..mm7. The list ends at a zero source pointer.
 *
 * The sums are then shifted right by 16, packed to words, offset by the
 * rounder at VROUNDER_OFFSET(%0), shifted right by 3, packed to unsigned
 * bytes and stored through MOVNTQ.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ".p2align                             4             \n\t"\
        "1:                                                 \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
        "movq                             %%mm0, %%mm3      \n\t"\
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
        "paddd                            %%mm0, %%mm4      \n\t"\
        "paddd                            %%mm3, %%mm5      \n\t"\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t" /* flags are consumed by the jnz below */\
        "movq                             %%mm2, %%mm0      \n\t"\
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "paddd                            %%mm2, %%mm6      \n\t"\
        "paddd                            %%mm0, %%mm7      \n\t"\
        " jnz                                1b             \n\t"\
        "psrad                              $16, %%mm4      \n\t"\
        "psrad                              $16, %%mm5      \n\t"\
        "psrad                              $16, %%mm6      \n\t"\
        "psrad                              $16, %%mm7      \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
        "packssdw                         %%mm5, %%mm4      \n\t"\
        "packssdw                         %%mm7, %%mm6      \n\t"\
        "paddw                            %%mm0, %%mm4      \n\t"\
        "paddw                            %%mm0, %%mm6      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "psraw                               $3, %%mm6      \n\t"\
        "packuswb                         %%mm6, %%mm4      \n\t"\
        MOVNTQ(%%mm4, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t" /* flags are consumed by the jb below */\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" ((x86_reg)width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/*
 * YSCALEYUV2YV121
 *
 * Asm template fragment (strings only; the caller supplies the asm
 * statement and its operands).
 *
 * Per iteration it loads 16 bytes of 16-bit samples from (%0, REG_a, 2),
 * shifts each word right arithmetically by 7, packs to 8 unsigned bytes and
 * stores them at (%1, REG_a). REG_a starts at %2 and is advanced by 8; the
 * loop repeats until that add produces a carry.
 *
 * NOTE(review): this presumably expects %2 to be a negative count with
 * %0/%1 pointing past the end of the data - confirm against the caller.
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ".p2align               4             \n\t" /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

/*
 * YSCALEYUV2YV121_ACCURATE
 *
 * Rounding variant of YSCALEYUV2YV121 (asm template fragment, same operand
 * usage: %0 source, %1 destination, %2 initial REG_a).
 *
 * Before the loop mm7 is set to 64 in every word (all-ones >> 15 << 6).
 * Inside the loop that constant is added with signed saturation before the
 * arithmetic shift right by 7.
 */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t" /* every word = 0xFFFF */\
    "psrlw                 $15, %%mm7     \n\t" /* every word = 1 */\
    "psllw                  $6, %%mm7     \n\t" /* every word = 64 */\
    ".p2align                4            \n\t" /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

/*
 * NOTE(review): the operand list below does not match the operands supplied
 * by YSCALEYUV2PACKEDX_END; it looks stale - confirm before relying on it.
 *
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW_reg),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/*
 * YSCALEYUV2PACKEDX_UV
 *
 * Opens an `__asm__ volatile(` statement and emits the chroma part of the
 * vertical filter. The statement is left open: further template text
 * follows, and YSCALEYUV2PACKEDX_END supplies the operands and the closing
 * ");".
 *
 * Label 1 is the head of the outer loop; the jump back to it is not part
 * of this macro. Label 2 is the inner loop over the filter list at
 * CHR_MMX_FILTER_OFFSET(%0): 16-byte entries with the source pointer at +0
 * and coefficients at +8, ended by a zero source pointer.
 *
 * On exit from the inner loop mm3 holds the accumulated U words and mm4 the
 * accumulated V words, each starting from the rounder at
 * VROUNDER_OFFSET(%0). V samples are read at displacement VOF from the
 * U samples.
 */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
        "movq                      %%mm3, %%mm4         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
        "add                         $16, %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pmulhw                    %%mm0, %%mm2         \n\t"\
        "pmulhw                    %%mm0, %%mm5         \n\t"\
        "paddw                     %%mm2, %%mm3         \n\t"\
        "paddw                     %%mm5, %%mm4         \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        " jnz                         2b                \n\t"

/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 *
 * Asm template fragment: vertical filter over one plane with caller-chosen
 * registers.
 *
 * offset     - string literal displacement from %0 to the filter list.
 * coeff      - register receiving each entry's coefficients.
 * src1/src2  - scratch registers for the two 8-byte source loads
 *              ((REG_S, REG_a, 2) and 8 bytes further).
 * dst1/dst2  - accumulators; both start from the rounder at
 *              VROUNDER_OFFSET(%0) and receive the pmulhw products.
 *
 * Register arguments are stringified with '#', so they are written in asm
 * syntax (e.g. %%mm0). The inner loop uses label 2 and ends at a zero
 * source pointer. Uses REG_a as the column index, REG_d and REG_S as list
 * and source pointers.
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"

/*
 * YSCALEYUV2PACKEDX
 *
 * Chroma pass (YSCALEYUV2PACKEDX_UV, which opens the asm statement)
 * followed by the luma pass over LUM_MMX_FILTER_OFFSET, accumulating into
 * mm1 and mm7 with mm0/mm2/mm5 as scratch. The statement stays open and is
 * closed by YSCALEYUV2PACKEDX_END.
 */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7)

/*
 * YSCALEYUV2PACKEDX_END
 *
 * Operand and clobber lists plus the closing ");" for an asm statement
 * opened by YSCALEYUV2PACKEDX_UV / YSCALEYUV2PACKEDX_ACCURATE_UV.
 *
 * Operands: %0 = &c->redDither (register), %1..%3 = dummy (memory),
 * %4 = dest (register), %5 = dstW_reg (memory). The enclosing scope must
 * provide c, dummy, dest and dstW_reg.
 * Clobbers REG_a, REG_d, REG_S.
 */
#define YSCALEYUV2PACKEDX_END                     \
        :: "r" (&c->redDither),                   \
            "m" (dummy), "m" (dummy), "m" (dummy),\
            "r" (dest), "m" (dstW_reg)            \
        : "%"REG_a, "%"REG_d, "%"REG_S            \
    );

/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV
 *
 * 32-bit-accumulating chroma pass. Opens an `__asm__ volatile(` statement
 * that is left open (closed by YSCALEYUV2PACKEDX_END).
 *
 * Label 1 is the outer loop head (back-edge not part of this macro);
 * label 2 is the inner loop over APCK_SIZE-byte filter entries at
 * CHR_MMX_FILTER_OFFSET(%0), each with source pointers at +0 and APCK_PTR2
 * and coefficients at APCK_COEF, ended by a zero source pointer.
 *
 * U sums are kept in mm4/mm5 and V sums in mm6/mm7. After the loop they
 * are shifted right by 16, packed to words, offset by the rounder at
 * VROUNDER_OFFSET(%0) and written to U_TEMP(%0) and V_TEMP(%0).
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pxor                      %%mm4, %%mm4         \n\t"\
        "pxor                      %%mm5, %%mm5         \n\t"\
        "pxor                      %%mm6, %%mm6         \n\t"\
        "pxor                      %%mm7, %%mm7         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
        "movq                      %%mm0, %%mm3         \n\t"\
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
        "paddd                     %%mm0, %%mm4         \n\t"\
        "paddd                     %%mm3, %%mm5         \n\t"\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        "movq                      %%mm2, %%mm0         \n\t"\
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "paddd                     %%mm2, %%mm6         \n\t"\
        "paddd                     %%mm0, %%mm7         \n\t"\
        " jnz                         2b                \n\t"\
        "psrad                       $16, %%mm4         \n\t"\
        "psrad                       $16, %%mm5         \n\t"\
        "psrad                       $16, %%mm6         \n\t"\
        "psrad                       $16, %%mm7         \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
        "packssdw                  %%mm5, %%mm4         \n\t"\
        "packssdw                  %%mm7, %%mm6         \n\t"\
        "paddw                     %%mm0, %%mm4         \n\t"\
        "paddw                     %%mm0, %%mm6         \n\t"\
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"

/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 *
 * Asm template fragment: 32-bit-accumulating vertical filter over the list
 * at `offset`(%0) (string literal displacement). It uses the same
 * APCK_SIZE / APCK_PTR2 / APCK_COEF entry layout as the ACCURATE_UV
 * macro, with inner-loop label 2.
 *
 * Result: mm1 and mm7 hold the two packed word results (first and second
 * 8-byte source group), each offset by the rounder at VROUNDER_OFFSET(%0).
 * Finally mm3 and mm4 are loaded from U_TEMP(%0) and V_TEMP(%0).
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"

/*
 * YSCALEYUV2PACKEDX_ACCURATE
 *
 * 32-bit-accumulating chroma pass (opens the asm statement) followed by the
 * matching luma pass over LUM_MMX_FILTER_OFFSET. The statement stays open
 * and is closed by YSCALEYUV2PACKEDX_END.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

/*
 * YSCALEYUV2RGBX
 *
 * Asm template fragment: YUV -> RGB arithmetic using the offsets and
 * coefficients stored relative to operand %0 (U_OFFSET, V_OFFSET, Y_OFFSET,
 * UG/VG/UB/VR_COEFF, Y_COEFF).
 *
 * Input registers: mm1 and mm7 = luma words, mm3 = U words, mm4 = V words.
 * Output registers (bytes, unsigned-saturated): mm2 = blue, mm4 = green,
 * mm5 = red. mm0, mm3 and mm6 are used as scratch.
 */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"

/*
 * REAL_YSCALEYUV2PACKED(index, c)
 *
 * Asm template fragment: blends two luma rows (%0, %1) and two chroma rows
 * (%2, %3) using the coefficients stored at CHR_MMX_FILTER_OFFSET+8(c) and
 * LUM_MMX_FILTER_OFFSET+8(c).
 *
 * index - register used as the column index (stringified; zeroed here).
 * c     - register holding the context base address (stringified).
 *
 * Side effect: before the loop both coefficient quadwords are shifted right
 * arithmetically by 3 and written back to memory at `c`.
 *
 * Label 1 is the loop head; the jump back to it is not part of this macro.
 * After the body: mm3 = U, mm4 = V, mm1 = first four luma words,
 * mm7 = next four luma words. Each is computed as
 * (row0 - row1) * coeff >> 16  +  (row1 >> 7).
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax], next 8 bytes*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax], next 8 bytes*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
    "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

/* The extra level lets macro arguments be expanded before
 * REAL_YSCALEYUV2PACKED stringifies them with '#'. */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)

/*
 * REAL_YSCALEYUV2RGB_UV(index, c)
 *
 * Asm template fragment: chroma half of the two-row blend used for RGB
 * output.
 *
 * index - register used as the column index (stringified; zeroed here).
 * c     - register holding the context base address (stringified).
 *
 * Reads chroma rows from %2 and %3 (V at displacement VOF), blends them as
 * (row0 - row1) * coeff >> 16 + (row1 >> 4) with the coefficient at
 * CHR_MMX_FILTER_OFFSET+8(c), then subtracts U_OFFSET/V_OFFSET and starts
 * the green contribution.
 *
 * Label 1 is the loop head; the jump back to it is not part of this macro.
 * On exit: mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */

/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 *
 * Asm template fragment: blends two rows of 16-bit samples.
 *
 * index  - column index register (stringified).
 * c      - context base register (stringified); the coefficient is read
 *          from LUM_MMX_FILTER_OFFSET+8(c).
 * b1, b2 - base registers/operands of the two rows (stringified).
 *
 * Result: mm1 (first four words) and mm7 (next four words), each
 * (b1 - b2) * coeff >> 16 + (b2 >> 4). mm0 and mm6 are scratch.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax], next 8 bytes*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax], next 8 bytes*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

/*
 * REAL_YSCALEYUV2RGB_COEFF(c)
 *
 * Asm template fragment: final YUV -> RGB arithmetic using the
 * coefficients stored relative to register `c` (stringified).
 *
 * Input registers: mm1/mm7 = luma words, mm2 = U - offset,
 * mm5 = V - offset, mm3 = ug, mm4 = vg.
 * Output registers (bytes, unsigned-saturated): mm2 = blue, mm4 = green,
 * mm5 = red. mm0, mm3 and mm6 are used as scratch.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

/* The extra level lets macro arguments be expanded before
 * REAL_YSCALEYUV2RGB_YA stringifies them with '#'. */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/*
 * YSCALEYUV2RGB(index, c)
 *
 * Full two-row blend to RGB: chroma blend, luma blend of rows %0 and %1,
 * then the coefficient stage. `index` and `c` are macro-expanded here
 * before the REAL_ macros stringify them.
 */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

/*
 * REAL_YSCALEYUV2PACKED1(index, c)
 *
 * Asm template fragment: single-row variant (no blending).
 *
 * index - column index register (stringified; zeroed here).
 * c     - accepted for interface symmetry; not referenced in the body.
 *
 * Label 1 is the loop head; the jump back to it is not part of this macro.
 * Loads chroma from %2 (V at displacement VOF) into mm3/mm4 and luma from
 * %0 into mm1/mm7, each shifted right arithmetically by 7.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax], next 8 bytes*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"

/* The extra level lets macro arguments be expanded before
 * REAL_YSCALEYUV2PACKED1 stringifies them with '#'. */
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2RGB1(index, c) \
505
    "xor            "#index", "#index"  \n\t"\
506
    ".p2align              4            \n\t"\
507
    "1:                                 \n\t"\
508
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
509
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
510
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
511
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
512
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
513
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
514
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
515
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
516
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
517
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
518
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
519
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
520
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
521
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
522
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
523
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
524
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
525
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
526
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
527
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
528
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
529
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
530
    "paddw             %%mm3, %%mm4     \n\t"\
531
    "movq              %%mm2, %%mm0     \n\t"\
532
    "movq              %%mm5, %%mm6     \n\t"\
533
    "movq              %%mm4, %%mm3     \n\t"\
534
    "punpcklwd         %%mm2, %%mm2     \n\t"\
535
    "punpcklwd         %%mm5, %%mm5     \n\t"\
536
    "punpcklwd         %%mm4, %%mm4     \n\t"\
537
    "paddw             %%mm1, %%mm2     \n\t"\
538
    "paddw             %%mm1, %%mm5     \n\t"\
539
    "paddw             %%mm1, %%mm4     \n\t"\
540
    "punpckhwd         %%mm0, %%mm0     \n\t"\
541
    "punpckhwd         %%mm6, %%mm6     \n\t"\
542
    "punpckhwd         %%mm3, %%mm3     \n\t"\
543
    "paddw             %%mm7, %%mm0     \n\t"\
544
    "paddw             %%mm7, %%mm6     \n\t"\
545
    "paddw             %%mm7, %%mm3     \n\t"\
546
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
547
    "packuswb          %%mm0, %%mm2     \n\t"\
548
    "packuswb          %%mm6, %%mm5     \n\t"\
549
    "packuswb          %%mm3, %%mm4     \n\t"\
550

    
551
/* Indirection layer: arguments are macro-expanded here before the REAL_
 * macro stringifies them with '#'. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
552

    
553
/*
 * Loop head producing unconverted luma/chroma words from one luma line and
 * the sum of two chroma lines.  Clears the index register, opens the local
 * label "1:" and, per iteration, leaves:
 *   mm3 = (words at %2+index     + words at %3+index)      >> 8 (logical)
 *   mm4 = (words at %2+index+VOF + words at %3+index+VOF)  >> 8 (logical)
 *   mm1 = words at %0 + 2*index      >> 7 (arithmetic)
 *   mm7 = words at %0 + 2*index + 8  >> 7 (arithmetic)
 * The parameter c is accepted but not referenced by this macro.
 * No backward jump is emitted here; the WRITE* macros in this file end with
 * "jb 1b" and are presumably meant to follow -- confirm at the call sites.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+VOF]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+VOF] + uvbuf1[eax+VOF]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0[eax], first four words */\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0[eax], next four words */\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
/* Indirection so the arguments are macro-expanded before being stringified. */
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
570

    
571
/*
 * YUV -> RGB loop head using one luma line (%0) and the sum of two chroma
 * lines (%2, %3), i.e. with vertical chrominance interpolation.
 * Clears the index register and opens local label "1:".  Coefficients and
 * offsets are read relative to the register passed as c.
 * On exit from each iteration the packed bytes are in
 *   mm2 = B, mm4 = G, mm5 = R
 * (mm0, mm1, mm3, mm6 and mm7 are used as scratch).
 * No backward jump is emitted here; the WRITE* macros in this file end with
 * "jb 1b" and are presumably meant to follow -- confirm at the call sites.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+VOF]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+VOF] + uvbuf1[eax+VOF]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /* buf0[eax], first four words */\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /* buf0[eax], next four words */\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4 (single luma line, no blend) */\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] >>4 (single luma line, no blend) */\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

/* Indirection so the arguments are macro-expanded before being stringified. */
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
624

    
625
/*
 * Load eight 16-bit alpha words from %1 + 2*index, shift each right by 7
 * (arithmetic) and pack them with unsigned saturation into the eight bytes
 * of mm7.  mm1 is clobbered.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
/* Indirection so the argument is macro-expanded before being stringified. */
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
632

    
633
/*
 * Interleave eight packed B, G, R and A bytes into eight 32-bit pixels,
 * store them at dst + 4*index, advance index by 8 and jump back to local
 * label "1:" while index is below dstw (unsigned compare).
 *
 *   b, g, r, a      registers holding the packed input bytes (b and r are
 *                   overwritten)
 *   q0, q2, q3, t   scratch registers
 *
 * The "1:" label must have been emitted by a preceding macro.
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
/* Indirection so the arguments are macro-expanded before being stringified. */
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
656

    
657
/*
 * Pack eight pixels into 16-bit words and store them at dst + 2*index, then
 * advance index by 8 and jump back to local label "1:" while index < dstw.
 *
 * Inputs: mm2 = B, mm4 = G, mm5 = R (packed bytes), mm7 = 0.
 * B and R are masked with bF8, G with bFC (presumably 0xF8 / 0xFC byte
 * masks -- the constants are defined outside this chunk).  B is shifted
 * right by 3 and paired with R as the high byte of each word; G is
 * zero-extended to words and shifted left by 3 before being OR-ed in.
 * Clobbers mm1..mm5.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
/* Indirection so the arguments are macro-expanded before being stringified. */
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
684

    
685
/*
 * Pack eight pixels into 15-bit-colour words and store them at
 * dst + 2*index, then advance index by 8 and jump back to local label "1:"
 * while index < dstw.
 *
 * Inputs: mm2 = B, mm4 = G, mm5 = R (packed bytes), mm7 = 0.
 * All three channels are masked with bF8 (presumably a 0xF8 byte mask --
 * defined outside this chunk).  Compared with WRITERGB16, R is additionally
 * shifted right by 1 and G is shifted left by 2 instead of 3.
 * Clobbers mm1..mm5.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
/* Indirection so the arguments are macro-expanded before being stringified. */
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
713

    
714
/*
 * Older mask-and-shift 24-bit writer.  Converts eight pixels to 24 packed
 * bytes, stores them as three quadwords at dst, advances dst by 24 and
 * index by 8, and jumps back to local label "1:" while index < dstw.
 * Not selected by the WRITEBGR24 definition in this part of the file.
 * Clobbers mm0..mm6.
 */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
769

    
770
/*
 * Plain-MMX 24-bit writer.  Converts eight pixels to 24 packed bytes,
 * stores them as three quadwords at dst, advances dst by 24 and index by 8,
 * and jumps back to local label "1:" while index < dstw.
 * Inputs: mm2 = B, mm4 = G, mm5 = R (packed bytes), mm7 = 0.
 * Clobbers mm0..mm7 (mm7 is overwritten, so it is no longer zero on exit).
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
822

    
823
/*
 * pshufw-based 24-bit writer (used when the MMX2 template is compiled).
 * Converts eight pixels to 24 packed bytes, stores them as three quadwords
 * at dst, advances dst by 24 and index by 8, and jumps back to local label
 * "1:" while index < dstw.
 * Inputs: mm2 = B, mm4 = G, mm5 = R (packed bytes).
 * mm0 and mm7 are loaded with the ff_M24A / ff_M24C masks; ff_M24B is used
 * directly from memory.  Clobbers mm0, mm1, mm3, mm4, mm6, mm7.
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
870

    
871
/* Pick the 24-bit writer for this template instantiation: the pshufw-based
 * variant when compiling for MMX2, the plain MMX variant otherwise.  Any
 * definition left over from an earlier inclusion is dropped first. */
#undef WRITEBGR24
#if COMPILE_TEMPLATE_MMX2
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
878

    
879
/*
 * Pack eight luma words (mm1 = first four, mm7 = next four) and four-word
 * chroma registers mm3 / mm4 into 16 output bytes in the order
 * mm1/mm7-byte, mm3-byte, mm1/mm7-byte, mm4-byte, ... and store them at
 * dst + 2*index.  Then advance index by 8 and jump back to local label
 * "1:" while index < dstw.  All packs use unsigned saturation.
 * Clobbers mm1, mm3, mm4, mm7.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
/* Indirection so the arguments are macro-expanded before being stringified. */
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
895

    
896

    
897
/**
 * Vertical filtering to planar output.
 *
 * With the MMX template and SWS_BITEXACT not set, each plane is produced by
 * the YSCALEYUV2YV12X asm macro (or its _ACCURATE variant when
 * SWS_ACCURATE_RND is set): chroma U and V when uDest is non-NULL, alpha
 * when CONFIG_SWSCALE_ALPHA is enabled and aDest is non-NULL, and luma
 * always.  The filter data for the asm path is taken from the context at
 * the *_MMX_FILTER_OFFSET offsets rather than from the filter arguments.
 *
 * Otherwise the AltiVec implementation (if compiled in) or the C reference
 * yuv2yuvXinC() is called.  Note that the AltiVec call is passed neither
 * alpSrc nor aDest.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                /* U starts at offset 0 of the chroma lines, V at offset VOF */
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                /* U starts at offset 0 of the chroma lines, V at offset VOF */
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}
937

    
938
/**
 * Vertical filtering to NV12-style output.
 *
 * There is no template-specific implementation: the call is forwarded
 * unchanged to the C routine yuv2nv12XinC().  The context pointer c is not
 * used.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
946

    
947
/**
 * Convert one line of 16-bit intermediate samples to 8-bit planar output
 * without vertical filtering.
 *
 * @param lumSrc   luma samples, dstW entries
 * @param chrSrc   chroma samples: U at [0 .. chrDstW), V at [VOFW .. VOFW+chrDstW)
 * @param alpSrc   alpha samples, dstW entries (only read when aDest is set)
 * @param dest     luma output
 * @param uDest    U output; when NULL the C path skips chroma entirely
 * @param vDest    V output
 * @param aDest    alpha output, may be NULL
 *
 * With the MMX template and SWS_BITEXACT not set, each non-NULL destination
 * plane is handled by the YSCALEYUV2YV121 asm macro (or its _ACCURATE
 * variant when SWS_ACCURATE_RND is set).  Otherwise the C loops below
 * compute (x + 64) >> 7 and clip the result to 0..255.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        /* Per-plane tables, index 0..3 = alpha, luma, U, V.  Source and
         * destination pointers are pre-advanced to the end of the line and
         * the asm receives the negated count (presumably so it can count up
         * towards zero -- the macro body is outside this chunk).
         * The sources are 16-bit samples, so the table is int16_t-typed. */
        const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        /* For a 16-bit input, val lies in -256..256, so bit 8 is set
         * exactly when val is outside 0..255. */
        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            /* Same bit-8 test as above, applied to both components at once */
            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
1017

    
1018

    
1019
/**
1020
 * vertical scale YV12 to RGB
1021
 */
1022
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1023
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1024
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1025
{
1026
#if COMPILE_TEMPLATE_MMX
1027
    x86_reg dummy=0;
1028
    x86_reg dstW_reg = dstW;
1029
    if(!(c->flags & SWS_BITEXACT)) {
1030
        if (c->flags & SWS_ACCURATE_RND) {
1031
            switch(c->dstFormat) {
1032
            case PIX_FMT_RGB32:
1033
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1034
                    YSCALEYUV2PACKEDX_ACCURATE
1035
                    YSCALEYUV2RGBX
1036
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1037
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1038
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1039
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1040
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1041
                    "psraw                        $3, %%mm1         \n\t"
1042
                    "psraw                        $3, %%mm7         \n\t"
1043
                    "packuswb                  %%mm7, %%mm1         \n\t"
1044
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1045

    
1046
                    YSCALEYUV2PACKEDX_END
1047
                } else {
1048
                    YSCALEYUV2PACKEDX_ACCURATE
1049
                    YSCALEYUV2RGBX
1050
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1051
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1052

    
1053
                    YSCALEYUV2PACKEDX_END
1054
                }
1055
                return;
1056
            case PIX_FMT_BGR24:
1057
                YSCALEYUV2PACKEDX_ACCURATE
1058
                YSCALEYUV2RGBX
1059
                "pxor %%mm7, %%mm7 \n\t"
1060
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1061
                "add %4, %%"REG_c"                        \n\t"
1062
                WRITEBGR24(%%REGc, %5, %%REGa)
1063

    
1064

    
1065
                :: "r" (&c->redDither),
1066
                "m" (dummy), "m" (dummy), "m" (dummy),
1067
                "r" (dest), "m" (dstW_reg)
1068
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1069
                );
1070
                return;
1071
            case PIX_FMT_RGB555:
1072
                YSCALEYUV2PACKEDX_ACCURATE
1073
                YSCALEYUV2RGBX
1074
                "pxor %%mm7, %%mm7 \n\t"
1075
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1076
#ifdef DITHER1XBPP
1077
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1078
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1079
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1080
#endif
1081

    
1082
                WRITERGB15(%4, %5, %%REGa)
1083
                YSCALEYUV2PACKEDX_END
1084
                return;
1085
            case PIX_FMT_RGB565:
1086
                YSCALEYUV2PACKEDX_ACCURATE
1087
                YSCALEYUV2RGBX
1088
                "pxor %%mm7, %%mm7 \n\t"
1089
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1090
#ifdef DITHER1XBPP
1091
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1092
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1093
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1094
#endif
1095

    
1096
                WRITERGB16(%4, %5, %%REGa)
1097
                YSCALEYUV2PACKEDX_END
1098
                return;
1099
            case PIX_FMT_YUYV422:
1100
                YSCALEYUV2PACKEDX_ACCURATE
1101
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1102

    
1103
                "psraw $3, %%mm3    \n\t"
1104
                "psraw $3, %%mm4    \n\t"
1105
                "psraw $3, %%mm1    \n\t"
1106
                "psraw $3, %%mm7    \n\t"
1107
                WRITEYUY2(%4, %5, %%REGa)
1108
                YSCALEYUV2PACKEDX_END
1109
                return;
1110
            }
1111
        } else {
1112
            switch(c->dstFormat) {
1113
            case PIX_FMT_RGB32:
1114
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1115
                    YSCALEYUV2PACKEDX
1116
                    YSCALEYUV2RGBX
1117
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1118
                    "psraw                        $3, %%mm1         \n\t"
1119
                    "psraw                        $3, %%mm7         \n\t"
1120
                    "packuswb                  %%mm7, %%mm1         \n\t"
1121
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1122
                    YSCALEYUV2PACKEDX_END
1123
                } else {
1124
                    YSCALEYUV2PACKEDX
1125
                    YSCALEYUV2RGBX
1126
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1127
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1128
                    YSCALEYUV2PACKEDX_END
1129
                }
1130
                return;
1131
            case PIX_FMT_BGR24:
1132
                YSCALEYUV2PACKEDX
1133
                YSCALEYUV2RGBX
1134
                "pxor                    %%mm7, %%mm7       \n\t"
1135
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1136
                "add                        %4, %%"REG_c"   \n\t"
1137
                WRITEBGR24(%%REGc, %5, %%REGa)
1138

    
1139
                :: "r" (&c->redDither),
1140
                "m" (dummy), "m" (dummy), "m" (dummy),
1141
                "r" (dest),  "m" (dstW_reg)
1142
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1143
                );
1144
                return;
1145
            case PIX_FMT_RGB555:
1146
                YSCALEYUV2PACKEDX
1147
                YSCALEYUV2RGBX
1148
                "pxor %%mm7, %%mm7 \n\t"
1149
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1150
#ifdef DITHER1XBPP
1151
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1152
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1153
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1154
#endif
1155

    
1156
                WRITERGB15(%4, %5, %%REGa)
1157
                YSCALEYUV2PACKEDX_END
1158
                return;
1159
            case PIX_FMT_RGB565:
1160
                YSCALEYUV2PACKEDX
1161
                YSCALEYUV2RGBX
1162
                "pxor %%mm7, %%mm7 \n\t"
1163
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1164
#ifdef DITHER1XBPP
1165
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1166
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1167
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1168
#endif
1169

    
1170
                WRITERGB16(%4, %5, %%REGa)
1171
                YSCALEYUV2PACKEDX_END
1172
                return;
1173
            case PIX_FMT_YUYV422:
1174
                YSCALEYUV2PACKEDX
1175
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1176

    
1177
                "psraw $3, %%mm3    \n\t"
1178
                "psraw $3, %%mm4    \n\t"
1179
                "psraw $3, %%mm1    \n\t"
1180
                "psraw $3, %%mm7    \n\t"
1181
                WRITEYUY2(%4, %5, %%REGa)
1182
                YSCALEYUV2PACKEDX_END
1183
                return;
1184
            }
1185
        }
1186
    }
1187
#endif /* COMPILE_TEMPLATE_MMX */
1188
#if COMPILE_TEMPLATE_ALTIVEC
1189
    /* The following list of supported dstFormat values should
1190
       match what's found in the body of ff_yuv2packedX_altivec() */
1191
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1192
         (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1193
          c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1194
          c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1195
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1196
                                   chrFilter, chrSrc, chrFilterSize,
1197
                                   dest, dstW, dstY);
1198
    else
1199
#endif
1200
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1201
                       chrFilter, chrSrc, chrFilterSize,
1202
                       alpSrc, dest, dstW, dstY);
1203
}
1204

    
1205
/**
 * vertical bilinear scale YV12 to RGB
 *
 * Produces one packed output line in dest from two luma lines (buf0/buf1),
 * two chroma lines (uvbuf0/uvbuf1) and, when alpha is in use, two alpha
 * lines (abuf0/abuf1).
 *
 * yalpha and uvalpha are the blend weights; their complements are computed
 * here as 4095 minus the weight.
 *
 * With MMX compiled in and SWS_BITEXACT not set, PIX_FMT_RGB32, BGR24,
 * RGB555, RGB565 and YUYV422 are handled by inline asm and the function
 * returns early. Everything else goes through YSCALE_YUV_2_ANYRGB_C.
 *
 * NOTE(review): dstW and y are not referenced by name in this function;
 * presumably the YSCALE_* / WRITE* macros use them -- confirm against the
 * macro definitions.
 */
1208
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1209
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1210
{
1211
    /* complementary weights and loop index; not used by name below, presumably
     * consumed by the C fallback macros -- TODO confirm */
    int  yalpha1=4095- yalpha;
1212
    int uvalpha1=4095-uvalpha;
1213
    int i;
1214

    
1215
#if COMPILE_TEMPLATE_MMX
1216
    /* the asm fast paths are skipped when bit-exact output is requested */
    if(!(c->flags & SWS_BITEXACT)) {
1217
        switch(c->dstFormat) {
1218
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1219
        case PIX_FMT_RGB32:
1220
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1221
#if ARCH_X86_64
1222
                /* x86-64: the alpha line pointers are passed as extra register
                 * operands (%6, %7) and r8 is declared as clobbered */
                __asm__ volatile(
1223
                    YSCALEYUV2RGB(%%r8, %5)
1224
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1225
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1226
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1227
                    "packuswb            %%mm7, %%mm1       \n\t"
1228
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1229

    
1230
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1231
                    "a" (&c->redDither)
1232
                    ,"r" (abuf0), "r" (abuf1)
1233
                    : "%r8"
1234
                );
1235
#else
1236
                /* 32-bit: park the alpha line pointers in the context; the asm
                 * reloads them from U_TEMP/V_TEMP into %0/%1 after pushing the
                 * luma pointers, and pops the luma pointers back afterwards */
                *(const uint16_t **)(&c->u_temp)=abuf0;
1237
                *(const uint16_t **)(&c->v_temp)=abuf1;
1238
                __asm__ volatile(
1239
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1240
                    "mov        %4, %%"REG_b"               \n\t"
1241
                    "push %%"REG_BP"                        \n\t"
1242
                    YSCALEYUV2RGB(%%REGBP, %5)
1243
                    "push                   %0              \n\t"
1244
                    "push                   %1              \n\t"
1245
                    "mov          "U_TEMP"(%5), %0          \n\t"
1246
                    "mov          "V_TEMP"(%5), %1          \n\t"
1247
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1248
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1249
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1250
                    "packuswb            %%mm7, %%mm1       \n\t"
1251
                    "pop                    %1              \n\t"
1252
                    "pop                    %0              \n\t"
1253
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1254
                    "pop %%"REG_BP"                         \n\t"
1255
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1256

    
1257
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1258
                    "a" (&c->redDither)
1259
                );
1260
#endif
1261
            } else {
1262
                /* no alpha plane: pcmpeqd sets mm7 to all ones and mm7 is passed
                 * to WRITEBGR32 in the slot where the alpha path passes the
                 * packed alpha (mm1).
                 * Common frame for this and the following asm statements:
                 * REG_b is saved to ESP_OFFSET(%5) and then loaded with dest,
                 * REG_BP is pushed and handed to the macros; both are restored
                 * at the end of the statement. */
                __asm__ volatile(
1263
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1264
                    "mov        %4, %%"REG_b"               \n\t"
1265
                    "push %%"REG_BP"                        \n\t"
1266
                    YSCALEYUV2RGB(%%REGBP, %5)
1267
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1268
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1269
                    "pop %%"REG_BP"                         \n\t"
1270
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1271

    
1272
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1273
                    "a" (&c->redDither)
1274
                );
1275
            }
1276
            return;
1277
        case PIX_FMT_BGR24:
1278
            /* mm7 is zeroed before the writer macro */
            __asm__ volatile(
1279
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1280
                "mov        %4, %%"REG_b"               \n\t"
1281
                "push %%"REG_BP"                        \n\t"
1282
                YSCALEYUV2RGB(%%REGBP, %5)
1283
                "pxor    %%mm7, %%mm7                   \n\t"
1284
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1285
                "pop %%"REG_BP"                         \n\t"
1286
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1287
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1288
                "a" (&c->redDither)
1289
            );
1290
            return;
1291
        case PIX_FMT_RGB555:
1292
            /* when DITHER1XBPP is defined, per-channel dither values are added
             * with unsigned saturation before packing to 15 bit */
            __asm__ volatile(
1293
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1294
                "mov        %4, %%"REG_b"               \n\t"
1295
                "push %%"REG_BP"                        \n\t"
1296
                YSCALEYUV2RGB(%%REGBP, %5)
1297
                "pxor    %%mm7, %%mm7                   \n\t"
1298
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1299
#ifdef DITHER1XBPP
1300
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1301
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1302
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1303
#endif
1304

    
1305
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1306
                "pop %%"REG_BP"                         \n\t"
1307
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1308

    
1309
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1310
                "a" (&c->redDither)
1311
            );
1312
            return;
1313
        case PIX_FMT_RGB565:
1314
            /* same structure as the RGB555 case, with the 16-bit writer */
            __asm__ volatile(
1315
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1316
                "mov        %4, %%"REG_b"               \n\t"
1317
                "push %%"REG_BP"                        \n\t"
1318
                YSCALEYUV2RGB(%%REGBP, %5)
1319
                "pxor    %%mm7, %%mm7                   \n\t"
1320
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1321
#ifdef DITHER1XBPP
1322
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1323
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1324
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1325
#endif
1326

    
1327
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1328
                "pop %%"REG_BP"                         \n\t"
1329
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1330
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1331
                "a" (&c->redDither)
1332
            );
1333
            return;
1334
        case PIX_FMT_YUYV422:
1335
            /* packed YUV output: uses the PACKED scaler macro instead of the
             * RGB one */
            __asm__ volatile(
1336
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1337
                "mov %4, %%"REG_b"                        \n\t"
1338
                "push %%"REG_BP"                        \n\t"
1339
                YSCALEYUV2PACKED(%%REGBP, %5)
1340
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1341
                "pop %%"REG_BP"                         \n\t"
1342
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1343
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1344
                "a" (&c->redDither)
1345
            );
1346
            return;
1347
        default: break;
1348
        }
1349
    }
1350
#endif //COMPILE_TEMPLATE_MMX
1351
    /* generic C path: reached without MMX, with SWS_BITEXACT, or for any
     * destination format not handled above */
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1352
}
1353

    
1354
/**
 * YV12 to RGB without scaling or interpolating
 *
 * Produces one packed output line in dest from a single luma line (buf0),
 * two chroma lines (uvbuf0/uvbuf1) and, when alpha is in use, one alpha
 * line (abuf0).
 *
 * If SWS_FULL_CHR_H_INT is set in flags, the work is delegated to
 * c->yuv2packed2 with buf0 and abuf0 passed for both lines and a luma
 * weight of 0.
 *
 * With MMX compiled in and SWS_BITEXACT not set, PIX_FMT_RGB32, BGR24,
 * RGB555, RGB565 and YUYV422 are handled by inline asm and the function
 * returns early. uvalpha < 2048 selects the "1" macro variants, otherwise
 * the "1b" variants are used. Everything else goes through
 * YSCALE_YUV_2_ANYRGB_C.
 *
 * NOTE(review): dstW and y are not referenced by name outside the
 * yuv2packed2 call; presumably the YSCALE_* / WRITE* macros use them --
 * confirm against the macro definitions.
 */
1357
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1358
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1359
{
1360
    /* yalpha1, i, buf1 and yalpha are not used by name below; presumably they
     * exist for the C fallback macros (see the FIXME notes) -- TODO confirm */
    const int yalpha1=0;
1361
    int i;
1362

    
1363
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1364
    const int yalpha= 4096; //FIXME ...
1365

    
1366
    /* full horizontal chroma interpolation: reuse the two-line function with
     * the same luma/alpha line passed twice */
    if (flags&SWS_FULL_CHR_H_INT) {
1367
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1368
        return;
1369
    }
1370

    
1371
#if COMPILE_TEMPLATE_MMX
1372
    /* the asm fast paths are skipped when bit-exact output is requested */
    if(!(flags & SWS_BITEXACT)) {
1373
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1374
            switch(dstFormat) {
1375
            case PIX_FMT_RGB32:
1376
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1377
                    /* alpha path: operand %1 carries abuf0 instead of buf1.
                     * Common frame for every asm statement in this function:
                     * REG_b is saved to ESP_OFFSET(%5) and then loaded with
                     * dest, REG_BP is pushed and handed to the macros; both
                     * are restored at the end of the statement. */
                    __asm__ volatile(
1378
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1379
                        "mov        %4, %%"REG_b"               \n\t"
1380
                        "push %%"REG_BP"                        \n\t"
1381
                        YSCALEYUV2RGB1(%%REGBP, %5)
1382
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1383
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1384
                        "pop %%"REG_BP"                         \n\t"
1385
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1386

    
1387
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1388
                        "a" (&c->redDither)
1389
                    );
1390
                } else {
1391
                    /* no alpha plane: pcmpeqd sets mm7 to all ones before it is
                     * passed to WRITEBGR32 */
                    __asm__ volatile(
1392
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1393
                        "mov        %4, %%"REG_b"               \n\t"
1394
                        "push %%"REG_BP"                        \n\t"
1395
                        YSCALEYUV2RGB1(%%REGBP, %5)
1396
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1397
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1398
                        "pop %%"REG_BP"                         \n\t"
1399
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1400

    
1401
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1402
                        "a" (&c->redDither)
1403
                    );
1404
                }
1405
                return;
1406
            case PIX_FMT_BGR24:
1407
                __asm__ volatile(
1408
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1409
                    "mov        %4, %%"REG_b"               \n\t"
1410
                    "push %%"REG_BP"                        \n\t"
1411
                    YSCALEYUV2RGB1(%%REGBP, %5)
1412
                    "pxor    %%mm7, %%mm7                   \n\t"
1413
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1414
                    "pop %%"REG_BP"                         \n\t"
1415
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1416

    
1417
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1418
                    "a" (&c->redDither)
1419
                );
1420
                return;
1421
            case PIX_FMT_RGB555:
1422
                /* when DITHER1XBPP is defined, per-channel dither values are
                 * added with unsigned saturation before packing */
                __asm__ volatile(
1423
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1424
                    "mov        %4, %%"REG_b"               \n\t"
1425
                    "push %%"REG_BP"                        \n\t"
1426
                    YSCALEYUV2RGB1(%%REGBP, %5)
1427
                    "pxor    %%mm7, %%mm7                   \n\t"
1428
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1429
#ifdef DITHER1XBPP
1430
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1431
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1432
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1433
#endif
1434
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1435
                    "pop %%"REG_BP"                         \n\t"
1436
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1437

    
1438
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1439
                    "a" (&c->redDither)
1440
                );
1441
                return;
1442
            case PIX_FMT_RGB565:
1443
                __asm__ volatile(
1444
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1445
                    "mov        %4, %%"REG_b"               \n\t"
1446
                    "push %%"REG_BP"                        \n\t"
1447
                    YSCALEYUV2RGB1(%%REGBP, %5)
1448
                    "pxor    %%mm7, %%mm7                   \n\t"
1449
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1450
#ifdef DITHER1XBPP
1451
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1452
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1453
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1454
#endif
1455

    
1456
                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1457
                    "pop %%"REG_BP"                         \n\t"
1458
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1459

    
1460
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1461
                    "a" (&c->redDither)
1462
                );
1463
                return;
1464
            case PIX_FMT_YUYV422:
1465
                __asm__ volatile(
1466
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1467
                    "mov        %4, %%"REG_b"               \n\t"
1468
                    "push %%"REG_BP"                        \n\t"
1469
                    YSCALEYUV2PACKED1(%%REGBP, %5)
1470
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1471
                    "pop %%"REG_BP"                         \n\t"
1472
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1473

    
1474
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475
                    "a" (&c->redDither)
1476
                );
1477
                return;
1478
            }
1479
        } else {
1480
            /* uvalpha >= 2048: same cases as above, built on the "1b" scaler
             * macros */
            switch(dstFormat) {
1481
            case PIX_FMT_RGB32:
1482
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1483
                    __asm__ volatile(
1484
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1485
                        "mov        %4, %%"REG_b"               \n\t"
1486
                        "push %%"REG_BP"                        \n\t"
1487
                        YSCALEYUV2RGB1b(%%REGBP, %5)
1488
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1489
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1490
                        "pop %%"REG_BP"                         \n\t"
1491
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1492

    
1493
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1494
                        "a" (&c->redDither)
1495
                    );
1496
                } else {
1497
                    __asm__ volatile(
1498
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1499
                        "mov        %4, %%"REG_b"               \n\t"
1500
                        "push %%"REG_BP"                        \n\t"
1501
                        YSCALEYUV2RGB1b(%%REGBP, %5)
1502
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1503
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1504
                        "pop %%"REG_BP"                         \n\t"
1505
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1506

    
1507
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1508
                        "a" (&c->redDither)
1509
                    );
1510
                }
1511
                return;
1512
            case PIX_FMT_BGR24:
1513
                __asm__ volatile(
1514
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1515
                    "mov        %4, %%"REG_b"               \n\t"
1516
                    "push %%"REG_BP"                        \n\t"
1517
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1518
                    "pxor    %%mm7, %%mm7                   \n\t"
1519
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1520
                    "pop %%"REG_BP"                         \n\t"
1521
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1522

    
1523
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1524
                    "a" (&c->redDither)
1525
                );
1526
                return;
1527
            case PIX_FMT_RGB555:
1528
                __asm__ volatile(
1529
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1530
                    "mov        %4, %%"REG_b"               \n\t"
1531
                    "push %%"REG_BP"                        \n\t"
1532
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1533
                    "pxor    %%mm7, %%mm7                   \n\t"
1534
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1535
#ifdef DITHER1XBPP
1536
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1537
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1538
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1539
#endif
1540
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1541
                    "pop %%"REG_BP"                         \n\t"
1542
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1543

    
1544
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1545
                    "a" (&c->redDither)
1546
                );
1547
                return;
1548
            case PIX_FMT_RGB565:
1549
                __asm__ volatile(
1550
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1551
                    "mov        %4, %%"REG_b"               \n\t"
1552
                    "push %%"REG_BP"                        \n\t"
1553
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1554
                    "pxor    %%mm7, %%mm7                   \n\t"
1555
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1556
#ifdef DITHER1XBPP
1557
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1558
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1559
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1560
#endif
1561

    
1562
                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1563
                    "pop %%"REG_BP"                         \n\t"
1564
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1565

    
1566
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1567
                    "a" (&c->redDither)
1568
                );
1569
                return;
1570
            case PIX_FMT_YUYV422:
1571
                __asm__ volatile(
1572
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1573
                    "mov        %4, %%"REG_b"               \n\t"
1574
                    "push %%"REG_BP"                        \n\t"
1575
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
1576
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1577
                    "pop %%"REG_BP"                         \n\t"
1578
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1579

    
1580
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1581
                    "a" (&c->redDither)
1582
                );
1583
                return;
1584
            }
1585
        }
1586
    }
1587
#endif /* COMPILE_TEMPLATE_MMX */
1588
    /* generic C path: reached without MMX, with SWS_BITEXACT, or for any
     * destination format the switches above do not handle; the same uvalpha
     * threshold picks between the RGB1/PACKED1 and RGB1B/PACKED1B macros */
    if (uvalpha < 2048) {
1589
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1590
    } else {
1591
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1592
    }
1593
}
1594

    
1595
//FIXME yuy2* can read up to 7 samples too much
1596

    
1597
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1598
{
1599
#if COMPILE_TEMPLATE_MMX
1600
    __asm__ volatile(
1601
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1602
        "mov                    %0, %%"REG_a"       \n\t"
1603
        "1:                                         \n\t"
1604
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1605
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1606
        "pand                %%mm2, %%mm0           \n\t"
1607
        "pand                %%mm2, %%mm1           \n\t"
1608
        "packuswb            %%mm1, %%mm0           \n\t"
1609
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
1610
        "add                    $8, %%"REG_a"       \n\t"
1611
        " js                    1b                  \n\t"
1612
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1613
        : "%"REG_a
1614
    );
1615
#else
1616
    int i;
1617
    for (i=0; i<width; i++)
1618
        dst[i]= src[2*i];
1619
#endif
1620
}
1621

    
1622
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1623
{
1624
#if COMPILE_TEMPLATE_MMX
1625
    __asm__ volatile(
1626
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1627
        "mov                    %0, %%"REG_a"       \n\t"
1628
        "1:                                         \n\t"
1629
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1630
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1631
        "psrlw                  $8, %%mm0           \n\t"
1632
        "psrlw                  $8, %%mm1           \n\t"
1633
        "packuswb            %%mm1, %%mm0           \n\t"
1634
        "movq                %%mm0, %%mm1           \n\t"
1635
        "psrlw                  $8, %%mm0           \n\t"
1636
        "pand                %%mm4, %%mm1           \n\t"
1637
        "packuswb            %%mm0, %%mm0           \n\t"
1638
        "packuswb            %%mm1, %%mm1           \n\t"
1639
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
1640
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
1641
        "add                    $4, %%"REG_a"       \n\t"
1642
        " js                    1b                  \n\t"
1643
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1644
        : "%"REG_a
1645
    );
1646
#else
1647
    int i;
1648
    for (i=0; i<width; i++) {
1649
        dstU[i]= src1[4*i + 1];
1650
        dstV[i]= src1[4*i + 3];
1651
    }
1652
#endif
1653
    assert(src1 == src2);
1654
}
1655

    
1656
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1657
{
1658
#if COMPILE_TEMPLATE_MMX
1659
    __asm__ volatile(
1660
        "mov                    %0, %%"REG_a"       \n\t"
1661
        "1:                                         \n\t"
1662
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1663
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1664
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1665
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1666
        "psrlw                  $8, %%mm0           \n\t"
1667
        "psrlw                  $8, %%mm1           \n\t"
1668
        "psrlw                  $8, %%mm2           \n\t"
1669
        "psrlw                  $8, %%mm3           \n\t"
1670
        "packuswb            %%mm1, %%mm0           \n\t"
1671
        "packuswb            %%mm3, %%mm2           \n\t"
1672
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
1673
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
1674
        "add                    $8, %%"REG_a"       \n\t"
1675
        " js                    1b                  \n\t"
1676
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1677
        : "%"REG_a
1678
    );
1679
#else
1680
    int i;
1681
    for (i=0; i<width; i++) {
1682
        dstU[i]= src1[2*i + 1];
1683
        dstV[i]= src2[2*i + 1];
1684
    }
1685
#endif
1686
}
1687

    
1688
/* This is almost identical to the previous, end exists only because
1689
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1690
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1691
{
1692
#if COMPILE_TEMPLATE_MMX
1693
    __asm__ volatile(
1694
        "mov                  %0, %%"REG_a"         \n\t"
1695
        "1:                                         \n\t"
1696
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1697
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1698
        "psrlw                $8, %%mm0             \n\t"
1699
        "psrlw                $8, %%mm1             \n\t"
1700
        "packuswb          %%mm1, %%mm0             \n\t"
1701
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1702
        "add                  $8, %%"REG_a"         \n\t"
1703
        " js                  1b                    \n\t"
1704
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1705
        : "%"REG_a
1706
    );
1707
#else
1708
    int i;
1709
    for (i=0; i<width; i++)
1710
        dst[i]= src[2*i+1];
1711
#endif
1712
}
1713

    
1714
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1715
{
1716
#if COMPILE_TEMPLATE_MMX
1717
    __asm__ volatile(
1718
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1719
        "mov                    %0, %%"REG_a"       \n\t"
1720
        "1:                                         \n\t"
1721
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1722
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1723
        "pand                %%mm4, %%mm0           \n\t"
1724
        "pand                %%mm4, %%mm1           \n\t"
1725
        "packuswb            %%mm1, %%mm0           \n\t"
1726
        "movq                %%mm0, %%mm1           \n\t"
1727
        "psrlw                  $8, %%mm0           \n\t"
1728
        "pand                %%mm4, %%mm1           \n\t"
1729
        "packuswb            %%mm0, %%mm0           \n\t"
1730
        "packuswb            %%mm1, %%mm1           \n\t"
1731
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
1732
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
1733
        "add                    $4, %%"REG_a"       \n\t"
1734
        " js                    1b                  \n\t"
1735
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1736
        : "%"REG_a
1737
    );
1738
#else
1739
    int i;
1740
    for (i=0; i<width; i++) {
1741
        dstU[i]= src1[4*i + 0];
1742
        dstV[i]= src1[4*i + 2];
1743
    }
1744
#endif
1745
    assert(src1 == src2);
1746
}
1747

    
1748
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1749
{
1750
#if COMPILE_TEMPLATE_MMX
1751
    __asm__ volatile(
1752
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1753
        "mov                    %0, %%"REG_a"       \n\t"
1754
        "1:                                         \n\t"
1755
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1756
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1757
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
1758
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
1759
        "pand                %%mm4, %%mm0           \n\t"
1760
        "pand                %%mm4, %%mm1           \n\t"
1761
        "pand                %%mm4, %%mm2           \n\t"
1762
        "pand                %%mm4, %%mm3           \n\t"
1763
        "packuswb            %%mm1, %%mm0           \n\t"
1764
        "packuswb            %%mm3, %%mm2           \n\t"
1765
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
1766
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
1767
        "add                    $8, %%"REG_a"       \n\t"
1768
        " js                    1b                  \n\t"
1769
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
1770
        : "%"REG_a
1771
    );
1772
#else
1773
    int i;
1774
    for (i=0; i<width; i++) {
1775
        dstU[i]= src1[2*i];
1776
        dstV[i]= src2[2*i];
1777
    }
1778
#endif
1779
}
1780

    
1781
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
1782
                                    const uint8_t *src, long width)
1783
{
1784
#if COMPILE_TEMPLATE_MMX
1785
    __asm__ volatile(
1786
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1787
        "mov                    %0, %%"REG_a"       \n\t"
1788
        "1:                                         \n\t"
1789
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1790
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1791
        "movq                %%mm0, %%mm2           \n\t"
1792
        "movq                %%mm1, %%mm3           \n\t"
1793
        "pand                %%mm4, %%mm0           \n\t"
1794
        "pand                %%mm4, %%mm1           \n\t"
1795
        "psrlw                  $8, %%mm2           \n\t"
1796
        "psrlw                  $8, %%mm3           \n\t"
1797
        "packuswb            %%mm1, %%mm0           \n\t"
1798
        "packuswb            %%mm3, %%mm2           \n\t"
1799
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
1800
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
1801
        "add                    $8, %%"REG_a"       \n\t"
1802
        " js                    1b                  \n\t"
1803
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
1804
        : "%"REG_a
1805
    );
1806
#else
1807
    int i;
1808
    for (i = 0; i < width; i++) {
1809
        dst1[i] = src[2*i+0];
1810
        dst2[i] = src[2*i+1];
1811
    }
1812
#endif
1813
}
1814

    
1815
/**
 * Split an interleaved chroma row into dstU and dstV via nvXXtoUV, with the
 * even source bytes going to dstU and the odd source bytes to dstV.
 * src2 and unused are not referenced.
 */
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1816
                                    const uint8_t *src1, const uint8_t *src2,
1817
                                    long width, uint32_t *unused)
1818
{
1819
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
1820
}
1821

    
1822
/**
 * Split an interleaved chroma row into dstU and dstV via nvXXtoUV, with the
 * destinations swapped relative to nv12ToUV: the even source bytes go to
 * dstV and the odd source bytes to dstU.
 * src2 and unused are not referenced.
 */
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
1823
                                    const uint8_t *src1, const uint8_t *src2,
1824
                                    long width, uint32_t *unused)
1825
{
1826
    /* note the swapped destination order */
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
1827
}
1828

    
1829
#if COMPILE_TEMPLATE_MMX
1830
/**
 * Convert a row of packed 24-bit pixels to 8-bit luma with MMX.
 *
 * The coefficient pair is chosen by srcFormat: the bgr24 tables for
 * PIX_FMT_BGR24, the rgb24 tables for anything else.
 *
 * Each loop iteration consumes 12 source bytes and stores 4 output bytes, so
 * the row is processed in groups of 4 pixels; the loop body always runs at
 * least once.
 *
 * @param dst       output row
 * @param src       packed input row, three bytes per pixel
 * @param width     number of output samples
 * @param srcFormat selects the coefficient tables
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
1831
{
1832

    
1833
    /* Load the two coefficient quadwords into mm5/mm6.
     * NOTE(review): the main loop below relies on mm5/mm6 still holding these
     * values, but they are loaded in a separate asm statement with no operands
     * or clobbers, so the compiler is not told about the dependency. */
    if(srcFormat == PIX_FMT_BGR24) {
1834
        __asm__ volatile(
1835
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
1836
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
1837
            :
1838
        );
1839
    } else {
1840
        __asm__ volatile(
1841
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
1842
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
1843
            :
1844
        );
1845
    }
1846

    
1847
    /* Main loop: REG_a counts from -width up to 0 in steps of 4 while src (%0)
     * is advanced by 12 bytes per iteration. Four overlapping 4-byte loads are
     * widened to words, multiplied-and-added against mm5/mm6, summed, offset
     * by ff_bgr24toYOffset, shifted right by 15 and packed down to bytes. */
    __asm__ volatile(
1848
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
1849
        "mov                        %2, %%"REG_a"   \n\t"
1850
        "pxor                    %%mm7, %%mm7       \n\t"
1851
        "1:                                         \n\t"
1852
        PREFETCH"               64(%0)              \n\t"
1853
        "movd                     (%0), %%mm0       \n\t"
1854
        "movd                    2(%0), %%mm1       \n\t"
1855
        "movd                    6(%0), %%mm2       \n\t"
1856
        "movd                    8(%0), %%mm3       \n\t"
1857
        "add                       $12, %0          \n\t"
1858
        "punpcklbw               %%mm7, %%mm0       \n\t"
1859
        "punpcklbw               %%mm7, %%mm1       \n\t"
1860
        "punpcklbw               %%mm7, %%mm2       \n\t"
1861
        "punpcklbw               %%mm7, %%mm3       \n\t"
1862
        "pmaddwd                 %%mm5, %%mm0       \n\t"
1863
        "pmaddwd                 %%mm6, %%mm1       \n\t"
1864
        "pmaddwd                 %%mm5, %%mm2       \n\t"
1865
        "pmaddwd                 %%mm6, %%mm3       \n\t"
1866
        "paddd                   %%mm1, %%mm0       \n\t"
1867
        "paddd                   %%mm3, %%mm2       \n\t"
1868
        "paddd                   %%mm4, %%mm0       \n\t"
1869
        "paddd                   %%mm4, %%mm2       \n\t"
1870
        "psrad                     $15, %%mm0       \n\t"
1871
        "psrad                     $15, %%mm2       \n\t"
1872
        "packssdw                %%mm2, %%mm0       \n\t"
1873
        "packuswb                %%mm0, %%mm0       \n\t"
1874
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
1875
        "add                        $4, %%"REG_a"   \n\t"
1876
        " js                        1b              \n\t"
1877
    : "+r" (src)
1878
    : "r" (dst+width), "g" ((x86_reg)-width)
1879
    : "%"REG_a
1880
    );
1881
}
1882

    
1883
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
1884
{
1885
    __asm__ volatile(
1886
        "movq                   24(%4), %%mm6       \n\t"
1887
        "mov                        %3, %%"REG_a"   \n\t"
1888
        "pxor                    %%mm7, %%mm7       \n\t"
1889
        "1:                                         \n\t"
1890
        PREFETCH"               64(%0)              \n\t"
1891
        "movd                     (%0), %%mm0       \n\t"
1892
        "movd                    2(%0), %%mm1       \n\t"
1893
        "punpcklbw               %%mm7, %%mm0       \n\t"
1894
        "punpcklbw               %%mm7, %%mm1       \n\t"
1895
        "movq                    %%mm0, %%mm2       \n\t"
1896
        "movq                    %%mm1, %%mm3       \n\t"
1897
        "pmaddwd                  (%4), %%mm0       \n\t"
1898
        "pmaddwd                 8(%4), %%mm1       \n\t"
1899
        "pmaddwd                16(%4), %%mm2       \n\t"
1900
        "pmaddwd                 %%mm6, %%mm3       \n\t"
1901
        "paddd                   %%mm1, %%mm0       \n\t"
1902
        "paddd                   %%mm3, %%mm2       \n\t"
1903

    
1904
        "movd                    6(%0), %%mm1       \n\t"
1905
        "movd                    8(%0), %%mm3       \n\t"
1906
        "add                       $12, %0          \n\t"
1907
        "punpcklbw               %%mm7, %%mm1       \n\t"
1908
        "punpcklbw               %%mm7, %%mm3       \n\t"
1909
        "movq                    %%mm1, %%mm4       \n\t"
1910
        "movq                    %%mm3, %%mm5       \n\t"
1911
        "pmaddwd                  (%4), %%mm1       \n\t"
1912
        "pmaddwd                 8(%4), %%mm3       \n\t"
1913
        "pmaddwd                16(%4), %%mm4       \n\t"
1914
        "pmaddwd                 %%mm6, %%mm5       \n\t"
1915
        "paddd                   %%mm3, %%mm1       \n\t"
1916
        "paddd                   %%mm5, %%mm4       \n\t"
1917

    
1918
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
1919
        "paddd                   %%mm3, %%mm0       \n\t"
1920
        "paddd                   %%mm3, %%mm2       \n\t"
1921
        "paddd                   %%mm3, %%mm1       \n\t"
1922
        "paddd                   %%mm3, %%mm4       \n\t"
1923
        "psrad                     $15, %%mm0       \n\t"
1924
        "psrad                     $15, %%mm2       \n\t"
1925
        "psrad                     $15, %%mm1       \n\t"
1926
        "psrad                     $15, %%mm4       \n\t"
1927
        "packssdw                %%mm1, %%mm0       \n\t"
1928
        "packssdw                %%mm4, %%mm2       \n\t"
1929
        "packuswb                %%mm0, %%mm0       \n\t"
1930
        "packuswb                %%mm2, %%mm2       \n\t"
1931
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
1932
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
1933
        "add                        $4, %%"REG_a"   \n\t"
1934
        " js                        1b              \n\t"
1935
    : "+r" (src)
1936
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "r"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24])
1937
    : "%"REG_a
1938
    );
1939
}
1940
#endif
1941

    
1942
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1943
{
1944
#if COMPILE_TEMPLATE_MMX
1945
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1946
#else
1947
    int i;
1948
    for (i=0; i<width; i++) {
1949
        int b= src[i*3+0];
1950
        int g= src[i*3+1];
1951
        int r= src[i*3+2];
1952

    
1953
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1954
    }
1955
#endif /* COMPILE_TEMPLATE_MMX */
1956
}
1957

    
1958
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1959
{
1960
#if COMPILE_TEMPLATE_MMX
1961
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1962
#else
1963
    int i;
1964
    for (i=0; i<width; i++) {
1965
        int b= src1[3*i + 0];
1966
        int g= src1[3*i + 1];
1967
        int r= src1[3*i + 2];
1968

    
1969
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1970
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1971
    }
1972
#endif /* COMPILE_TEMPLATE_MMX */
1973
    assert(src1 == src2);
1974
}
1975

    
1976
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1977
{
1978
    int i;
1979
    for (i=0; i<width; i++) {
1980
        int b= src1[6*i + 0] + src1[6*i + 3];
1981
        int g= src1[6*i + 1] + src1[6*i + 4];
1982
        int r= src1[6*i + 2] + src1[6*i + 5];
1983

    
1984
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1985
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1986
    }
1987
    assert(src1 == src2);
1988
}
1989

    
1990
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1991
{
1992
#if COMPILE_TEMPLATE_MMX
1993
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1994
#else
1995
    int i;
1996
    for (i=0; i<width; i++) {
1997
        int r= src[i*3+0];
1998
        int g= src[i*3+1];
1999
        int b= src[i*3+2];
2000

    
2001
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2002
    }
2003
#endif
2004
}
2005

    
2006
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2007
{
2008
#if COMPILE_TEMPLATE_MMX
2009
    assert(src1==src2);
2010
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2011
#else
2012
    int i;
2013
    assert(src1==src2);
2014
    for (i=0; i<width; i++) {
2015
        int r= src1[3*i + 0];
2016
        int g= src1[3*i + 1];
2017
        int b= src1[3*i + 2];
2018

    
2019
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2020
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2021
    }
2022
#endif
2023
}
2024

    
2025
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2026
{
2027
    int i;
2028
    assert(src1==src2);
2029
    for (i=0; i<width; i++) {
2030
        int r= src1[6*i + 0] + src1[6*i + 3];
2031
        int g= src1[6*i + 1] + src1[6*i + 4];
2032
        int b= src1[6*i + 2] + src1[6*i + 5];
2033

    
2034
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2035
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2036
    }
2037
}
2038

    
2039

    
2040
// bilinear / bicubic scaling
2041
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2042
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
2043
{
2044
#if COMPILE_TEMPLATE_MMX
2045
    assert(filterSize % 4 == 0 && filterSize>0);
2046
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
2047
        x86_reg counter= -2*dstW;
2048
        filter-= counter*2;
2049
        filterPos-= counter/2;
2050
        dst-= counter/2;
2051
        __asm__ volatile(
2052
#if defined(PIC)
2053
            "push            %%"REG_b"              \n\t"
2054
#endif
2055
            "pxor                %%mm7, %%mm7       \n\t"
2056
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
2057
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
2058
            ".p2align                4              \n\t"
2059
            "1:                                     \n\t"
2060
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
2061
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
2062
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
2063
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
2064
            "movd      (%3, %%"REG_a"), %%mm0       \n\t"
2065
            "movd      (%3, %%"REG_b"), %%mm2       \n\t"
2066
            "punpcklbw           %%mm7, %%mm0       \n\t"
2067
            "punpcklbw           %%mm7, %%mm2       \n\t"
2068
            "pmaddwd             %%mm1, %%mm0       \n\t"
2069
            "pmaddwd             %%mm2, %%mm3       \n\t"
2070
            "movq                %%mm0, %%mm4       \n\t"
2071
            "punpckldq           %%mm3, %%mm0       \n\t"
2072
            "punpckhdq           %%mm3, %%mm4       \n\t"
2073
            "paddd               %%mm4, %%mm0       \n\t"
2074
            "psrad                  $7, %%mm0       \n\t"
2075
            "packssdw            %%mm0, %%mm0       \n\t"
2076
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
2077
            "add                    $4, %%"REG_BP"  \n\t"
2078
            " jnc                   1b              \n\t"
2079

    
2080
            "pop            %%"REG_BP"              \n\t"
2081
#if defined(PIC)
2082
            "pop             %%"REG_b"              \n\t"
2083
#endif
2084
            : "+a" (counter)
2085
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2086
#if !defined(PIC)
2087
            : "%"REG_b
2088
#endif
2089
        );
2090
    } else if (filterSize==8) {
2091
        x86_reg counter= -2*dstW;
2092
        filter-= counter*4;
2093
        filterPos-= counter/2;
2094
        dst-= counter/2;
2095
        __asm__ volatile(
2096
#if defined(PIC)
2097
            "push             %%"REG_b"             \n\t"
2098
#endif
2099
            "pxor                 %%mm7, %%mm7      \n\t"
2100
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
2101
            "mov              %%"REG_a", %%"REG_BP" \n\t"
2102
            ".p2align                 4             \n\t"
2103
            "1:                                     \n\t"
2104
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
2105
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
2106
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
2107
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2108
            "movd       (%3, %%"REG_a"), %%mm0      \n\t"
2109
            "movd       (%3, %%"REG_b"), %%mm2      \n\t"
2110
            "punpcklbw            %%mm7, %%mm0      \n\t"
2111
            "punpcklbw            %%mm7, %%mm2      \n\t"
2112
            "pmaddwd              %%mm1, %%mm0      \n\t"
2113
            "pmaddwd              %%mm2, %%mm3      \n\t"
2114

    
2115
            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
2116
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2117
            "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
2118
            "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
2119
            "punpcklbw            %%mm7, %%mm4      \n\t"
2120
            "punpcklbw            %%mm7, %%mm2      \n\t"
2121
            "pmaddwd              %%mm1, %%mm4      \n\t"
2122
            "pmaddwd              %%mm2, %%mm5      \n\t"
2123
            "paddd                %%mm4, %%mm0      \n\t"
2124
            "paddd                %%mm5, %%mm3      \n\t"
2125
            "movq                 %%mm0, %%mm4      \n\t"
2126
            "punpckldq            %%mm3, %%mm0      \n\t"
2127
            "punpckhdq            %%mm3, %%mm4      \n\t"
2128
            "paddd                %%mm4, %%mm0      \n\t"
2129
            "psrad                   $7, %%mm0      \n\t"
2130
            "packssdw             %%mm0, %%mm0      \n\t"
2131
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2132
            "add                     $4, %%"REG_BP" \n\t"
2133
            " jnc                    1b             \n\t"
2134

    
2135
            "pop             %%"REG_BP"             \n\t"
2136
#if defined(PIC)
2137
            "pop              %%"REG_b"             \n\t"
2138
#endif
2139
            : "+a" (counter)
2140
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2141
#if !defined(PIC)
2142
            : "%"REG_b
2143
#endif
2144
        );
2145
    } else {
2146
        const uint8_t *offset = src+filterSize;
2147
        x86_reg counter= -2*dstW;
2148
        //filter-= counter*filterSize/2;
2149
        filterPos-= counter/2;
2150
        dst-= counter/2;
2151
        __asm__ volatile(
2152
            "pxor                  %%mm7, %%mm7     \n\t"
2153
            ".p2align                  4            \n\t"
2154
            "1:                                     \n\t"
2155
            "mov                      %2, %%"REG_c" \n\t"
2156
            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2157
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2158
            "mov                      %5, %%"REG_c" \n\t"
2159
            "pxor                  %%mm4, %%mm4     \n\t"
2160
            "pxor                  %%mm5, %%mm5     \n\t"
2161
            "2:                                     \n\t"
2162
            "movq                   (%1), %%mm1     \n\t"
2163
            "movq               (%1, %6), %%mm3     \n\t"
2164
            "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2165
            "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2166
            "punpcklbw             %%mm7, %%mm0     \n\t"
2167
            "punpcklbw             %%mm7, %%mm2     \n\t"
2168
            "pmaddwd               %%mm1, %%mm0     \n\t"
2169
            "pmaddwd               %%mm2, %%mm3     \n\t"
2170
            "paddd                 %%mm3, %%mm5     \n\t"
2171
            "paddd                 %%mm0, %%mm4     \n\t"
2172
            "add                      $8, %1        \n\t"
2173
            "add                      $4, %%"REG_c" \n\t"
2174
            "cmp                      %4, %%"REG_c" \n\t"
2175
            " jb                      2b            \n\t"
2176
            "add                      %6, %1        \n\t"
2177
            "movq                  %%mm4, %%mm0     \n\t"
2178
            "punpckldq             %%mm5, %%mm4     \n\t"
2179
            "punpckhdq             %%mm5, %%mm0     \n\t"
2180
            "paddd                 %%mm0, %%mm4     \n\t"
2181
            "psrad                    $7, %%mm4     \n\t"
2182
            "packssdw              %%mm4, %%mm4     \n\t"
2183
            "mov                      %3, %%"REG_a" \n\t"
2184
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2185
            "add                      $4, %0        \n\t"
2186
            " jnc                     1b            \n\t"
2187

    
2188
            : "+r" (counter), "+r" (filter)
2189
            : "m" (filterPos), "m" (dst), "m"(offset),
2190
            "m" (src), "r" ((x86_reg)filterSize*2)
2191
            : "%"REG_a, "%"REG_c, "%"REG_d
2192
        );
2193
    }
2194
#else
2195
#if COMPILE_TEMPLATE_ALTIVEC
2196
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2197
#else
2198
    int i;
2199
    for (i=0; i<dstW; i++) {
2200
        int j;
2201
        int srcPos= filterPos[i];
2202
        int val=0;
2203
        //printf("filterPos: %d\n", filterPos[i]);
2204
        for (j=0; j<filterSize; j++) {
2205
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2206
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2207
        }
2208
        //filter += hFilterSize;
2209
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2210
        //dst[i] = val>>7;
2211
    }
2212
#endif /* COMPILE_TEMPLATE_ALTIVEC */
2213
#endif /* COMPILE_MMX */
2214
}
2215

    
2216
//FIXME all pal and rgb srcFormats could do this conversion as well
2217
//FIXME all scalers more complex than bilinear could do half of this transform
2218
static void RENAME(chrRangeToJpeg)(int16_t *dst, int width)
2219
{
2220
    int i;
2221
    for (i = 0; i < width; i++) {
2222
        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2223
        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2224
    }
2225
}
2226
static void RENAME(chrRangeFromJpeg)(int16_t *dst, int width)
2227
{
2228
    int i;
2229
    for (i = 0; i < width; i++) {
2230
        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
2231
        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2232
    }
2233
}
2234
static void RENAME(lumRangeToJpeg)(int16_t *dst, int width)
2235
{
2236
    int i;
2237
    for (i = 0; i < width; i++)
2238
        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2239
}
2240
static void RENAME(lumRangeFromJpeg)(int16_t *dst, int width)
2241
{
2242
    int i;
2243
    for (i = 0; i < width; i++)
2244
        dst[i] = (dst[i]*14071 + 33561947)>>14;
2245
}
2246

    
2247
/*
 * Shared inner step of the plain-x86 fast bilinear scalers.
 * On entry: %edi = src[xx], %esi = src[xx+1], %ecx = xalpha.
 * On exit:  %esi = ((src[xx+1] - src[xx])*xalpha + (src[xx] << 16)) >> 9,
 *           REG_D = operand %1 of the enclosing asm statement.
 * The last line deliberately has no line continuation, so the macro ends here
 * regardless of what follows the definition.
 */
#define FAST_BILINEAR_X86 \
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
    "shll      $16, %%edi    \n\t"                                              \
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
    "mov        %1, %%"REG_D"\n\t"                                              \
    "shrl       $9, %%esi    \n\t"
2254

    
2255
/**
 * Fast bilinear horizontal scaling of one luma line.
 *
 * Three implementations are selected at compile/run time:
 *  - x86 with COMPILE_TEMPLATE_MMX2 and c->canMMX2BeUsed set: calls the code
 *    at c->lumMmx2FilterCode eight times from inline asm (that code is not
 *    part of this function), then overwrites every output whose source
 *    position (i*xInc)>>16 is at or beyond srcW-1 with src[srcW-1]*128;
 *  - other x86: plain integer asm, two outputs per loop iteration;
 *  - everything else: portable C,
 *        dst[i] = (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha
 *    with xx = xpos>>16 and xalpha = (xpos&0xFFFF)>>9.
 *
 * @param c        scaler context (MMX2 filter tables and generated code)
 * @param dst      output line
 * @param dstWidth number of output samples
 * @param src      8-bit input line
 * @param srcW     number of input samples
 * @param xInc     source step per output sample in 16.16 fixed point
 */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    /* REG_b is saved here and restored by hand around the asm in PIC builds */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t" // src
            "mov                      %1, %%"REG_D" \n\t" // dst
            "mov                      %2, %%"REG_d" \n\t" // filter
            "mov                      %3, %%"REG_b" \n\t" // filterPos
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

/* One call into the generated filter code, followed by advancing the source
 * (REG_c) and destination (REG_D) pointers and clearing REG_a. */
#define CALL_MMX2_FILTER_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"

#else

/* One call into the generated filter code, followed by advancing the source
 * (REG_c) and destination (REG_D) pointers and clearing REG_a. */
#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            /* "memory": the asm (and the code it calls) accesses memory through
             * pointers it loads itself, and stores REG_b into operand %5. */
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D, "memory"
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* replace the outputs that would need samples at or past the last input */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    x86_reg dstWidth_reg = dstWidth;
    //NO MMX just normal asm ...
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ".p2align    4                       \n\t"
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask)
        /* "memory": src is read and dst is written through pointers */
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi", "memory"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2377

    
2378
      // *** horizontal scale Y line to temp buffer
2379
/**
 * Horizontally scale one luma or alpha line into dst.
 *
 * Steps, in order:
 *  1. advance src by the alpha or luma source offset from the context;
 *  2. if the context provides an input converter for the selected plane, run
 *     it into formatConvBuffer and scale from that buffer instead;
 *  3. scale with c->hyscale_fast when it is set, otherwise with c->hScale and
 *     the supplied filter;
 *  4. for luma only, apply c->lumConvertRange when it is set.
 *
 * @param c                scaler context
 * @param dst              output line
 * @param dstWidth         number of output samples
 * @param src              input line
 * @param srcW             number of input samples
 * @param xInc             horizontal step, passed through to the scalers
 * @param hLumFilter       filter coefficients for c->hScale
 * @param hLumFilterPos    filter positions for c->hScale
 * @param hLumFilterSize   taps per output for c->hScale
 * @param formatConvBuffer scratch line for the input converter
 * @param pal              passed through to the input converter
 * @param isAlpha          non-zero to process the alpha plane instead of luma
 */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                   const int16_t *hLumFilter,
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal, int isAlpha)
{
    void (*inputConv)(uint8_t *, const uint8_t *, long, uint32_t *);
    void (*rangeConv)(uint16_t *, int);

    inputConv = isAlpha ? c->alpToYV12 : c->lumToYV12;
    rangeConv = isAlpha ? NULL : c->lumConvertRange;
    src      += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;

    if (inputConv) {
        inputConv(formatConvBuffer, src, srcW, pal);
        src = formatConvBuffer;
    }

    if (c->hyscale_fast) {
        // fast bilinear upscale / crap downscale
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
    } else {
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }

    if (rangeConv)
        rangeConv(dst, dstWidth);
}
2404

    
2405
/**
 * Fast bilinear horizontal scaling of a pair of chroma lines.
 *
 * The first plane is written to dst, the second one VOFW elements later (the
 * asm paths address it as a byte offset of VOF). Three implementations:
 *  - x86 with COMPILE_TEMPLATE_MMX2 and c->canMMX2BeUsed set: calls the code
 *    at c->chrMmx2FilterCode four times per plane from inline asm (that code
 *    is not part of this function; CALL_MMX2_FILTER_CODE is defined earlier
 *    in the file), then overwrites every output whose source position
 *    (i*xInc)>>16 is at or beyond srcW-1 with the last input sample * 128;
 *  - other x86: plain integer asm, one output per plane per iteration;
 *  - everything else: portable C,
 *        dst[i] = src[xx]*(xalpha^127) + src[xx+1]*xalpha
 *    with xx = xpos>>16 and xalpha = (xpos&0xFFFF)>>9.
 *
 * @param c        scaler context (MMX2 filter tables and generated code)
 * @param dst      output buffer holding both planes
 * @param dstWidth number of output samples per plane
 * @param src1     8-bit input line of the first plane
 * @param src2     8-bit input line of the second plane
 * @param srcW     number of input samples per plane
 * @param xInc     source step per output sample in 16.16 fixed point
 */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    /* REG_b is saved here and restored by hand around the asm in PIC builds */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t" // src1
            "mov                 %1, %%"REG_D"  \n\t" // dst
            "mov                 %2, %%"REG_d"  \n\t" // filter
            "mov                 %3, %%"REG_b"  \n\t" // filterPos
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            /* second plane: restart from src2 and from dst + VOF bytes */
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            /* "memory": the asm (and the code it calls) accesses memory through
             * pointers it loads itself, and stores REG_b into operand %6. */
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D, "memory"
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* replace the outputs that would need samples at or past the last input */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            //printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        x86_reg dstWidth_reg = dstWidth;
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
            ".p2align    4                          \n\t"
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
            :: "m" (src1), "m" (dst), "g" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth_reg), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            /* "memory": both sources are read and dst is written through pointers */
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi", "memory"
        );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2526

    
2527
/**
 * Horizontally scale a pair of chroma lines into dst (first plane) and
 * dst + VOFW (second plane).
 *
 * Steps, in order:
 *  1. advance both source pointers by c->chrSrcOffset;
 *  2. if c->chrToYV12 is set, run it into formatConvBuffer /
 *     formatConvBuffer + VOFW and scale from there instead;
 *  3. scale with c->hcscale_fast when it is set, otherwise call c->hScale
 *     once per plane with the supplied filter;
 *  4. apply c->chrConvertRange when it is set.
 *
 * @param c                scaler context
 * @param dst              output buffer holding both planes
 * @param dstWidth         number of output samples per plane
 * @param src1             input line of the first plane
 * @param src2             input line of the second plane
 * @param srcW             number of input samples per plane
 * @param xInc             horizontal step, passed through to the scalers
 * @param hChrFilter       filter coefficients for c->hScale
 * @param hChrFilterPos    filter positions for c->hScale
 * @param hChrFilterSize   taps per output for c->hScale
 * @param formatConvBuffer scratch buffer for the input converter
 * @param pal              passed through to the input converter
 */
static inline void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{

    src1 += c->chrSrcOffset;
    src2 += c->chrSrcOffset;

    if (c->chrToYV12) {
        c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1= formatConvBuffer;
        src2= formatConvBuffer+VOFW;
    }

    if (!c->hcscale_fast) {
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    } else { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
}
2553

    
2554
/* Compile-time switch for the buffer tracing macro below; 0 disables it. */
#define DEBUG_SWSCALE_BUFFERS 0
/*
 * Forwards its arguments to av_log(c, AV_LOG_DEBUG, ...) when
 * DEBUG_SWSCALE_BUFFERS is non-zero. A variable named `c` must be in scope
 * where the macro is used. The do { } while (0) wrapper makes the expansion a
 * single statement, so a following `else` cannot attach to the inner `if`.
 */
#define DEBUG_BUFFERS(...) do { if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__); } while (0)
2556

    
2557
/**
 * Scale one slice of the source picture and emit every destination line
 * that can be produced from the source lines seen so far.
 *
 * Horizontally scaled source lines are kept in the ring buffers lumPixBuf,
 * chrPixBuf and (optionally) alpPixBuf; the vertical scaler / output
 * converter then reads from those buffers. The progress state (dstY, ring
 * buffer indices, last buffered source lines) is loaded from the context on
 * entry and stored back before returning, and is reset when srcSliceY is 0.
 *
 * Note that src[] and srcStride[] are modified in place: for packed source
 * formats all four entries are made equal to entry 0, and the strides of
 * planes 1 and 2 are shifted left by c->vChrDrop.
 *
 * @param c          scaler context
 * @param src        source plane pointers for this slice
 * @param srcStride  source strides
 * @param srcSliceY  first source line of this slice
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers
 * @param dstStride  destination strides
 * @return number of destination lines written by this call
 */
static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* local copies of context fields that are only read in this function */
    const int srcW    = c->srcW;
    const int dstW    = c->dstW;
    const int dstH    = c->dstH;
    const int chrDstW = c->chrDstW;
    const int chrSrcW = c->chrSrcW;
    const int lumXInc = c->lumXInc;
    const int chrXInc = c->chrXInc;
    const enum PixelFormat dstFormat = c->dstFormat;
    const int flags   = c->flags;
    int16_t *vLumFilterPos = c->vLumFilterPos;
    int16_t *vChrFilterPos = c->vChrFilterPos;
    int16_t *hLumFilterPos = c->hLumFilterPos;
    int16_t *hChrFilterPos = c->hChrFilterPos;
    int16_t *vLumFilter    = c->vLumFilter;
    int16_t *vChrFilter    = c->vChrFilter;
    int16_t *hLumFilter    = c->hLumFilter;
    int16_t *hChrFilter    = c->hChrFilter;
    int32_t *lumMmxFilter  = c->lumMmxFilter;
    int32_t *chrMmxFilter  = c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter = c->alpMmxFilter;
    const int vLumFilterSize = c->vLumFilterSize;
    const int vChrFilterSize = c->vChrFilterSize;
    const int hLumFilterSize = c->hLumFilterSize;
    const int hChrFilterSize = c->hChrFilterSize;
    int16_t **lumPixBuf = c->lumPixBuf;
    int16_t **chrPixBuf = c->chrPixBuf;
    int16_t **alpPixBuf = c->alpPixBuf;
    const int vLumBufSize = c->vLumBufSize;
    const int vChrBufSize = c->vChrBufSize;
    uint8_t *formatConvBuffer = c->formatConvBuffer;
    /* slice position/height in chroma lines; the height is rounded up */
    const int chrSrcSliceY = srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH = -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal = c->pal_yuv;

    /* progress state: modified below and written back to the context at the end */
    int dstY         = c->dstY;
    int lumBufIndex  = c->lumBufIndex;
    int chrBufIndex  = c->chrBufIndex;
    int lastInLumBuf = c->lastInLumBuf;
    int lastInChrBuf = c->lastInChrBuf;

    /* packed input: make every plane pointer and stride equal to plane 0 */
    if (isPacked(c->srcFormat)) {
        src[1] = src[2] = src[3] = src[0];
        srcStride[1] = srcStride[2] = srcStride[3] = srcStride[0];
    }
    srcStride[1] <<= c->vChrDrop;
    srcStride[2] <<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                  srcSliceY, srcSliceH, dstY, dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                  vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);

    /* one-time warning (per process) about destination strides not being a multiple of 8 */
    if (dstStride[0]%8 != 0 || dstStride[1]%8 != 0 || dstStride[2]%8 != 0 || dstStride[3]%8 != 0) {
        static int warnedAlready = 0; //FIXME move this into the context perhaps
        if ((flags & SWS_PRINT_INFO) && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready = 1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (!srcSliceY) {
        lumBufIndex  = -1;
        chrBufIndex  = -1;
        dstY         = 0;
        lastInLumBuf = -1;
        lastInChrBuf = -1;
    }

    lastDstY = dstY;

    for (; dstY < dstH; dstY++) {
        const int chrDstY     = dstY >> c->chrDstVSubSample;
        const int chrSkipMask = (1 << c->chrDstVSubSample) - 1;

        unsigned char *dest  = dst[0] + dstStride[0]*dstY;
        unsigned char *uDest = dst[1] + dstStride[1]*chrDstY;
        unsigned char *vDest = dst[2] + dstStride[2]*chrDstY;
        unsigned char *aDest = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3]*dstY : NULL;

        /* first source lines needed as input for this output line */
        const int firstLumSrcY  = vLumFilterPos[dstY];
        const int firstLumSrcY2 = vLumFilterPos[FFMIN(dstY | chrSkipMask, dstH-1)];
        const int firstChrSrcY  = vChrFilterPos[chrDstY];
        /* last source lines needed as input for this output line */
        int lastLumSrcY  = firstLumSrcY  + vLumFilterSize - 1;
        int lastLumSrcY2 = firstLumSrcY2 + vLumFilterSize - 1;
        int lastChrSrcY  = firstChrSrcY  + vChrFilterSize - 1;
        int enough_lines;

        /* windows into the ring buffers handed to the vertical scaler; set once
           the buffers are filled and the indices wrapped */
        const int16_t **lumSrcPtr;
        const int16_t **chrSrcPtr;
        const int16_t **alpSrcPtr;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf = firstLumSrcY - 1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf = firstChrSrcY - 1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                      firstLumSrcY, lastLumSrcY, lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                      firstChrSrcY, lastChrSrcY, lastInChrBuf);

        // Does this slice contain every source line needed to output line dstY?
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        if (!enough_lines) {
            /* not enough input: just buffer whatever this slice provides */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                          lastLumSrcY, lastChrSrcY);
        }

        /* horizontal scaling of luma (and alpha) lines into the ring buffer */
        while (lastInLumBuf < lastLumSrcY) {
            const int lumLine   = lastInLumBuf + 1 - srcSliceY; /* line index inside this slice */
            const uint8_t *src1 = src[0] + lumLine*srcStride[0];
            const uint8_t *src2 = src[3] + lumLine*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lumLine < srcSliceH);
            assert(lumLine >= 0);
            RENAME(hyscale)(c, lumPixBuf[lumBufIndex], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
                            formatConvBuffer,
                            pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[lumBufIndex], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
                                formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                          lumBufIndex, lastInLumBuf);
        }

        /* horizontal scaling of chroma lines into the ring buffer */
        while (lastInChrBuf < lastChrSrcY) {
            const int chrLine   = lastInChrBuf + 1 - chrSrcSliceY; /* line index inside this slice */
            const uint8_t *src1 = src[1] + chrLine*srcStride[1];
            const uint8_t *src2 = src[2] + chrLine*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(chrLine < (chrSrcSliceH));
            assert(chrLine >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[chrBufIndex], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
                                formatConvBuffer,
                                pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                          chrBufIndex, lastInChrBuf);
        }

        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex -= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex -= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if COMPILE_TEMPLATE_MMX
        c->blueDither = ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither = ff_dither8[dstY&1];
        else
            c->greenDither = ff_dither4[dstY&1];
        c->redDither = ff_dither8[(dstY+1)&1];
#endif

        /* Both output paths below use the same windows into the ring buffers. */
        lumSrcPtr = (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
        chrSrcPtr = (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
        alpSrcPtr = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;

        if (dstY < dstH-2) {
#if COMPILE_TEMPLATE_MMX
            int i;
            if (flags & SWS_ACCURATE_RND) {
                /* packed layout: two source pointers plus one coefficient pair per entry */
                const int s = APCK_SIZE / 8;
                for (i = 0; i < vLumFilterSize; i += 2) {
                    const int base = s*i;
                    *(const void**)&lumMmxFilter[base              ] = lumSrcPtr[i];
                    *(const void**)&lumMmxFilter[base + APCK_PTR2/4] = lumSrcPtr[i + (vLumFilterSize>1)];
                    lumMmxFilter[base + APCK_COEF/4    ] =
                    lumMmxFilter[base + APCK_COEF/4 + 1] = vLumFilter[dstY*vLumFilterSize + i]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(const void**)&alpMmxFilter[base              ] = alpSrcPtr[i];
                        *(const void**)&alpMmxFilter[base + APCK_PTR2/4] = alpSrcPtr[i + (vLumFilterSize>1)];
                        alpMmxFilter[base + APCK_COEF/4    ] =
                        alpMmxFilter[base + APCK_COEF/4 + 1] = lumMmxFilter[base + APCK_COEF/4];
                    }
                }
                for (i = 0; i < vChrFilterSize; i += 2) {
                    const int base = s*i;
                    *(const void**)&chrMmxFilter[base              ] = chrSrcPtr[i];
                    *(const void**)&chrMmxFilter[base + APCK_PTR2/4] = chrSrcPtr[i + (vChrFilterSize>1)];
                    chrMmxFilter[base + APCK_COEF/4    ] =
                    chrMmxFilter[base + APCK_COEF/4 + 1] = vChrFilter[chrDstY*vChrFilterSize + i]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                /* four 32-bit words per tap: pointer low, pointer high, coefficient twice */
                for (i = 0; i < vLumFilterSize; i++) {
                    lumMmxFilter[4*i+0] = (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1] = (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2] =
                    lumMmxFilter[4*i+3] =
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0] = (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1] = (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2] =
                        alpMmxFilter[4*i+3] = lumMmxFilter[4*i+2];
                    }
                }
                for (i = 0; i < vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0] = (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1] = (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2] =
                    chrMmxFilter[4*i+3] =
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                if (dstY & chrSkipMask) uDest = NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter + dstY*vLumFilterSize,    lumSrcPtr, vLumFilterSize,
                             vChrFilter + chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat == PIX_FMT_GRAY8) { //YV12 like
                if ((dstY & chrSkipMask) || isGray(dstFormat)) uDest = vDest = NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter + dstY*vLumFilterSize,    lumSrcPtr, vLumFilterSize,
                                  vChrFilter + chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf = chrSrcPtr[0];
                    const int16_t *alpBuf = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter + dstY*vLumFilterSize,    lumSrcPtr, vLumFilterSize,
                                vChrFilter + chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha = vChrFilter[2*dstY+1];
                    if (flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter + dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter + dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha = vLumFilter[2*dstY+1];
                    int chrAlpha = vChrFilter[2*dstY+1];
                    lumMmxFilter[2] =
                    lumMmxFilter[3] = vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2] =
                    chrMmxFilter[3] = vChrFilter[2*chrDstY]*0x10001;
                    if (flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter + dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter + dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if (flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter + dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter + dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter + dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter + dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                if (dstY & chrSkipMask) uDest = NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                             vLumFilter + dstY*vLumFilterSize,    lumSrcPtr, vLumFilterSize,
                             vChrFilter + chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat == PIX_FMT_GRAY8) { //YV12
                if ((dstY & chrSkipMask) || isGray(dstFormat)) uDest = vDest = NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter + dstY*vLumFilterSize,    lumSrcPtr, vLumFilterSize,
                                  vChrFilter + chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else {
                    yuv2yuvXinC(
                                vLumFilter + dstY*vLumFilterSize,    lumSrcPtr, vLumFilterSize,
                                vChrFilter + chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter + dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter + dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter + dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter + dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* YUVA420P output without an alpha buffer: fill the alpha lines written in this call with 255 */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
    else                             __asm__ volatile("emms"  :::"memory");
#endif

    /* write the progress state back so the next slice continues from here */
    c->dstY         = dstY;
    c->lumBufIndex  = lumBufIndex;
    c->chrBufIndex  = chrBufIndex;
    c->lastInLumBuf = lastInLumBuf;
    c->lastInChrBuf = lastInChrBuf;

    return dstY - lastDstY;
}
2922

    
2923
/**
 * Fill in the function pointers and source byte offsets of the context for
 * this template variant.
 *
 * Sets the vertical output functions and the generic horizontal scaler
 * unconditionally, the fast bilinear horizontal scalers only when
 * SWS_FAST_BILINEAR is requested (and, in MMX builds, c->canMMX2BeUsed is
 * set), and then selects the input converters (chrToYV12, lumToYV12,
 * alpToYV12), the per-format source offsets, the range converters and the
 * needs_hcscale flag from the source/destination formats.
 *
 * @param c context to initialise; c->srcFormat, c->dstFormat, c->flags,
 *          c->chrSrcHSubSample, c->alpPixBuf, c->srcRange and c->dstRange
 *          are read here
 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    const enum PixelFormat srcFormat = c->srcFormat;
    /* non-zero when the "_half" chroma readers must be used */
    const int halfChr = c->chrSrcHSubSample != 0;
    int fastBilinear;

    /* vertical scalers / output writers */
    c->yuv2nv12X   = RENAME(yuv2nv12X);
    c->yuv2yuv1    = RENAME(yuv2yuv1);
    c->yuv2yuvX    = RENAME(yuv2yuvX);
    c->yuv2packed1 = RENAME(yuv2packed1);
    c->yuv2packed2 = RENAME(yuv2packed2);
    c->yuv2packedX = RENAME(yuv2packedX);

    /* generic horizontal scaler */
    c->hScale      = RENAME(hScale);

    fastBilinear = (c->flags & SWS_FAST_BILINEAR) != 0;
#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!c->canMMX2BeUsed)
        fastBilinear = 0;
#endif
    if (fastBilinear) {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }

    /* chroma input converters: YUV / palettised sources */
    c->chrToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422:
        c->chrToYV12 = RENAME(yuy2ToUV);
        break;
    case PIX_FMT_UYVY422:
        c->chrToYV12 = RENAME(uyvyToUV);
        break;
    case PIX_FMT_NV12:
        c->chrToYV12 = RENAME(nv12ToUV);
        break;
    case PIX_FMT_NV21:
        c->chrToYV12 = RENAME(nv21ToUV);
        break;
    case PIX_FMT_RGB8:
    case PIX_FMT_BGR8:
    case PIX_FMT_PAL8:
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE:
        c->chrToYV12 = palToUV;
        break;
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
        c->chrToYV12 = RENAME(BEToUV);
        break;
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
        c->chrToYV12 = RENAME(LEToUV);
        break;
    default:
        break;
    }

    /* chroma input converters: RGB sources, "_half" variant when halfChr is set */
    switch (srcFormat) {
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE:
        c->chrToYV12 = halfChr ? rgb48ToUV_half : rgb48ToUV;
        break;
    case PIX_FMT_BGR48BE:
    case PIX_FMT_BGR48LE:
        c->chrToYV12 = halfChr ? bgr48ToUV_half : bgr48ToUV;
        break;
    case PIX_FMT_RGB32:
        c->chrToYV12 = halfChr ? bgr32ToUV_half : bgr32ToUV;
        break;
    case PIX_FMT_RGB32_1:
        c->chrToYV12 = halfChr ? bgr321ToUV_half : bgr321ToUV;
        break;
    case PIX_FMT_BGR24:
        c->chrToYV12 = halfChr ? RENAME(bgr24ToUV_half) : RENAME(bgr24ToUV);
        break;
    case PIX_FMT_BGR565:
        c->chrToYV12 = halfChr ? bgr16ToUV_half : bgr16ToUV;
        break;
    case PIX_FMT_BGR555:
        c->chrToYV12 = halfChr ? bgr15ToUV_half : bgr15ToUV;
        break;
    case PIX_FMT_BGR32:
        c->chrToYV12 = halfChr ? rgb32ToUV_half : rgb32ToUV;
        break;
    case PIX_FMT_BGR32_1:
        c->chrToYV12 = halfChr ? rgb321ToUV_half : rgb321ToUV;
        break;
    case PIX_FMT_RGB24:
        c->chrToYV12 = halfChr ? RENAME(rgb24ToUV_half) : RENAME(rgb24ToUV);
        break;
    case PIX_FMT_RGB565:
        c->chrToYV12 = halfChr ? rgb16ToUV_half : rgb16ToUV;
        break;
    case PIX_FMT_RGB555:
        c->chrToYV12 = halfChr ? rgb15ToUV_half : rgb15ToUV;
        break;
    default:
        break;
    }

    /* luma input converters */
    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422:
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_Y400A:
    case PIX_FMT_GRAY16BE:
        c->lumToYV12 = RENAME(yuy2ToY);
        break;
    case PIX_FMT_UYVY422:
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE:
        c->lumToYV12 = RENAME(uyvyToY);
        break;
    case PIX_FMT_BGR24:
        c->lumToYV12 = RENAME(bgr24ToY);
        break;
    case PIX_FMT_BGR565:
        c->lumToYV12 = bgr16ToY;
        break;
    case PIX_FMT_BGR555:
        c->lumToYV12 = bgr15ToY;
        break;
    case PIX_FMT_RGB24:
        c->lumToYV12 = RENAME(rgb24ToY);
        break;
    case PIX_FMT_RGB565:
        c->lumToYV12 = rgb16ToY;
        break;
    case PIX_FMT_RGB555:
        c->lumToYV12 = rgb15ToY;
        break;
    case PIX_FMT_RGB8:
    case PIX_FMT_BGR8:
    case PIX_FMT_PAL8:
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE:
        c->lumToYV12 = palToY;
        break;
    case PIX_FMT_MONOBLACK:
        c->lumToYV12 = monoblack2Y;
        break;
    case PIX_FMT_MONOWHITE:
        c->lumToYV12 = monowhite2Y;
        break;
    case PIX_FMT_RGB32:
        c->lumToYV12 = bgr32ToY;
        break;
    case PIX_FMT_RGB32_1:
        c->lumToYV12 = bgr321ToY;
        break;
    case PIX_FMT_BGR32:
        c->lumToYV12 = rgb32ToY;
        break;
    case PIX_FMT_BGR32_1:
        c->lumToYV12 = rgb321ToY;
        break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE:
        c->lumToYV12 = rgb48ToY;
        break;
    case PIX_FMT_BGR48BE:
    case PIX_FMT_BGR48LE:
        c->lumToYV12 = bgr48ToY;
        break;
    default:
        break;
    }

    /* alpha input converters, only when an alpha line buffer exists */
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_RGB32:
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32:
        case PIX_FMT_BGR32_1:
            c->alpToYV12 = abgrToA;
            break;
        case PIX_FMT_Y400A:
            c->alpToYV12 = RENAME(yuy2ToY);
            break;
        default:
            break;
        }
    }

    /* per-format byte offsets added to the source pointers before reading */
    switch (srcFormat) {
    case PIX_FMT_Y400A:
        c->alpSrcOffset = 1;
        break;
    case PIX_FMT_RGB32:
    case PIX_FMT_BGR32:
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB48LE:
    case PIX_FMT_BGR48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    default:
        break;
    }

    /* range conversion is only installed when the ranges differ and the output is not RGB */
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (!c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        }
    }

    /* chroma needs horizontal scaling unless either side is gray or the source is 1 bpp mono */
    if (!isGray(srcFormat) && !isGray(c->dstFormat) &&
        srcFormat != PIX_FMT_MONOBLACK && srcFormat != PIX_FMT_MONOWHITE)
        c->needs_hcscale = 1;
}