Statistics
| Branch: | Revision:

ffmpeg / libswscale / swscale_template.c @ 22b6a24c

History | View | Annotate | Download (137 KB)

1
/*
2
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
3
 *
4
 * This file is part of FFmpeg.
5
 *
6
 * FFmpeg is free software; you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation; either version 2 of the License, or
9
 * (at your option) any later version.
10
 *
11
 * FFmpeg is distributed in the hope that it will be useful,
12
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
 * GNU General Public License for more details.
15
 *
16
 * You should have received a copy of the GNU General Public License
17
 * along with FFmpeg; if not, write to the Free Software
18
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
 *
20
 * The C code (not assembly, MMX, ...) of this file can be used
21
 * under the LGPL license.
22
 */
23

    
24
/* Reset the SIMD helper macros so this template can be re-included with a
 * different COMPILE_TEMPLATE_* configuration (MMX / MMX2 / 3DNow!). */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
28

    
29
/* Pick the software-prefetch mnemonic for the target instruction set;
 * plain MMX has no prefetch instruction, so emit a commented-out nop. */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#else
#define PREFETCH  " # nop"
#endif
36

    
37
#if COMPILE_TEMPLATE_MMX2
38
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
39
#elif COMPILE_TEMPLATE_AMD3DNOW
40
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
41
#endif
42

    
43
/* 64-bit store used for output rows: non-temporal (cache-bypassing) movntq
 * on MMX2, regular movq otherwise.  The MOVNTQ wrapper exists so that macro
 * arguments are expanded before being stringified by REAL_MOVNTQ. */
#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
49

    
50
#if COMPILE_TEMPLATE_ALTIVEC
51
#include "ppc/swscale_altivec_template.c"
52
#endif
53

    
54
/* Vertically filter 16-bit intermediate samples into one 8-bit plane row.
 * %0 = &c->redDither (base for all context offsets), %1 = dest, %2 = width.
 * The filter list at `offset`(%0) is a sequence of 16-byte entries, each
 * holding a source-line pointer (loaded into REG_S) and a coefficient at
 * +8; a NULL pointer terminates the inner accumulation loop (test/jnz).
 * Accumulators start from the rounder constant at VROUNDER_OFFSET, each
 * source line is scaled with pmulhw, the sum is >>3, packed to unsigned
 * bytes and stored 8 pixels at a time via MOVNTQ. */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ASMALIGN(4) /* FIXME Unroll? */\
        "1:                                                 \n\t"\
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
        "add                                $16, %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "pmulhw                           %%mm0, %%mm2      \n\t"\
        "pmulhw                           %%mm0, %%mm5      \n\t"\
        "paddw                            %%mm2, %%mm3      \n\t"\
        "paddw                            %%mm5, %%mm4      \n\t"\
        " jnz                                1b             \n\t"\
        "psraw                               $3, %%mm3      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "packuswb                         %%mm4, %%mm3      \n\t"\
        MOVNTQ(%%mm3, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
        "movq                             %%mm3, %%mm4      \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" (width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
89

    
90
/* Higher-precision variant of YSCALEYUV2YV12X: processes filter entries in
 * pairs (APCK_* packed layout: two pointers plus interleaved coefficients),
 * interleaving two source lines with punpckl/hwd and accumulating with
 * pmaddwd into four 32-bit accumulators (mm4-mm7), which avoids the
 * intermediate 16-bit rounding of the pmulhw path.  The sums are >>16,
 * re-packed to words, rounded via VROUNDER_OFFSET, >>3, packed to bytes
 * and stored with MOVNTQ.  Same operand contract as YSCALEYUV2YV12X. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        ASMALIGN(4) \
        "1:                                                 \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
        "movq                             %%mm0, %%mm3      \n\t"\
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
        "paddd                            %%mm0, %%mm4      \n\t"\
        "paddd                            %%mm3, %%mm5      \n\t"\
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
        "movq                             %%mm2, %%mm0      \n\t"\
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
        "paddd                            %%mm2, %%mm6      \n\t"\
        "paddd                            %%mm0, %%mm7      \n\t"\
        " jnz                                1b             \n\t"\
        "psrad                              $16, %%mm4      \n\t"\
        "psrad                              $16, %%mm5      \n\t"\
        "psrad                              $16, %%mm6      \n\t"\
        "psrad                              $16, %%mm7      \n\t"\
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
        "packssdw                         %%mm5, %%mm4      \n\t"\
        "packssdw                         %%mm7, %%mm6      \n\t"\
        "paddw                            %%mm0, %%mm4      \n\t"\
        "paddw                            %%mm0, %%mm6      \n\t"\
        "psraw                               $3, %%mm4      \n\t"\
        "psraw                               $3, %%mm6      \n\t"\
        "packuswb                         %%mm6, %%mm4      \n\t"\
        MOVNTQ(%%mm4, (%1, %%REGa))\
        "add                                 $8, %%"REG_a"  \n\t"\
        "cmp                                 %2, %%"REG_a"  \n\t"\
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
        "pxor                             %%mm4, %%mm4      \n\t"\
        "pxor                             %%mm5, %%mm5      \n\t"\
        "pxor                             %%mm6, %%mm6      \n\t"\
        "pxor                             %%mm7, %%mm7      \n\t"\
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
        "jb                                  1b             \n\t"\
        :: "r" (&c->redDither),\
        "r" (dest), "g" (width)\
        : "%"REG_a, "%"REG_d, "%"REG_S\
    );
151

    
152
/* Unfiltered 1:1 copy of a 16-bit intermediate row to 8-bit output:
 * truncating >>7 per sample, packed to bytes, 8 pixels per iteration.
 * %0 = src, %1 = dest, %2 = loop counter start (counts up through zero;
 * the jnc exit relies on the add's carry flag, so %2 is presumably the
 * negated width — caller-defined). */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
164

    
165
/* Rounding variant of YSCALEYUV2YV121: mm7 is built as the constant 64
 * (1 << 15 >> 15 << 6) and added with saturation before the >>7, so the
 * shift rounds to nearest instead of truncating.  Same operand contract
 * as YSCALEYUV2YV121. */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
182

    
183
/*
184
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
185
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
186
       "r" (dest), "m" (dstW),
187
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
188
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
189
*/
190
/* Opens the asm statement for the packed-output scalers and vertically
 * filters the chroma: walks the chroma filter list at CHR_MMX_FILTER_OFFSET
 * (NULL source pointer terminates loop 2), accumulating U into mm3 and V
 * (at byte offset VOF from the same source line) into mm4, both seeded with
 * the VROUNDER constant.  Leaves the outer per-pixel loop (label 1) open;
 * must be paired with a *_YA body and YSCALEYUV2PACKEDX_END. */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
        ASMALIGN(4)\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
        "movq                      %%mm3, %%mm4         \n\t"\
        ASMALIGN(4)\
        "2:                                             \n\t"\
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
        "add                         $16, %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pmulhw                    %%mm0, %%mm2         \n\t"\
        "pmulhw                    %%mm0, %%mm5         \n\t"\
        "paddw                     %%mm2, %%mm3         \n\t"\
        "paddw                     %%mm5, %%mm4         \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        " jnz                         2b                \n\t"\

213

    
214
/* Luma (or alpha) companion to YSCALEYUV2PACKEDX_UV: same filter-list walk
 * starting at `offset`, but the working registers are parameters so the
 * caller chooses which mm registers receive the two 4-pixel accumulators
 * (dst1/dst2, seeded with VROUNDER) without clobbering the chroma results. */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\

232

    
233
/* Standard packed-output prologue: chroma pass, then luma pass with the
 * fixed register assignment mm1/mm7 for the two luma accumulators. */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

236

    
237
/* Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE)_UV:
 * supplies the operands (%0 = &c->redDither, %4 = dest, %5 = dstW; the
 * dummy "m" slots keep operand numbering stable) and the clobber list. */
#define YSCALEYUV2PACKEDX_END                     \
        :: "r" (&c->redDither),                   \
            "m" (dummy), "m" (dummy), "m" (dummy),\
            "r" (dest), "m" (dstW)                \
        : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
243

    
244
/* High-precision chroma pass for the packed scalers: like
 * YSCALEYUV2YV12X_ACCURATE, filter entries are consumed in pairs (APCK_*
 * layout) with pmaddwd into 32-bit accumulators, then >>16, repacked and
 * rounded.  Because all eight mm registers are needed, the filtered U/V
 * words are spilled to the context scratch slots U_TEMP/V_TEMP for the
 * luma pass to pick up.  Leaves the asm statement and loop 1 open. */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
        ASMALIGN(4)\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pxor                      %%mm4, %%mm4         \n\t"\
        "pxor                      %%mm5, %%mm5         \n\t"\
        "pxor                      %%mm6, %%mm6         \n\t"\
        "pxor                      %%mm7, %%mm7         \n\t"\
        ASMALIGN(4)\
        "2:                                             \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
        "movq                      %%mm0, %%mm3         \n\t"\
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
        "paddd                     %%mm0, %%mm4         \n\t"\
        "paddd                     %%mm3, %%mm5         \n\t"\
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        "movq                      %%mm2, %%mm0         \n\t"\
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "paddd                     %%mm2, %%mm6         \n\t"\
        "paddd                     %%mm0, %%mm7         \n\t"\
        " jnz                         2b                \n\t"\
        "psrad                       $16, %%mm4         \n\t"\
        "psrad                       $16, %%mm5         \n\t"\
        "psrad                       $16, %%mm6         \n\t"\
        "psrad                       $16, %%mm7         \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
        "packssdw                  %%mm5, %%mm4         \n\t"\
        "packssdw                  %%mm7, %%mm6         \n\t"\
        "paddw                     %%mm0, %%mm4         \n\t"\
        "paddw                     %%mm0, %%mm6         \n\t"\
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\

293

    
294
/* High-precision luma pass: paired pmaddwd accumulation over the filter
 * list at `offset`, producing the two rounded luma word vectors in mm1/mm7,
 * then reloading the chroma results spilled by the _UV pass from
 * U_TEMP/V_TEMP into mm3/mm4 (the register layout YSCALEYUV2RGBX expects). */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\


    
339
/* High-precision packed-output prologue: accurate chroma pass, then the
 * accurate luma pass reading the luma filter list. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
342

    
343
/* YUV -> RGB conversion core for the multi-tap (X) scalers.
 * Input register contract: mm1/mm7 = two 4-pixel luma vectors,
 * mm3 = U words, mm4 = V words (as left by the *_YA macros above).
 * Subtracts the U/V/Y offsets, applies the per-context fixed-point
 * coefficients with pmulhw, widens each channel to per-pixel words via
 * the self-punpck trick (chroma is half-resolution, so each chroma word
 * covers two luma pixels), adds luma, and saturate-packs.
 * Output: mm2 = B bytes, mm5 = R bytes, mm4 = G bytes (8 pixels). */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\


    
379
/* Two-tap (bilinear between two source rows) vertical scale for packed YUV
 * output.  First pre-shifts the stored chroma and luma blend coefficients
 * right by 3 IN PLACE (note: the context slots at *_MMX_FILTER_OFFSET+8
 * are modified), then per 8-pixel group blends uvbuf0/uvbuf1 (V lives VOF
 * bytes past U) and buf0/buf1 as  a + (b-a)*alpha  using pmulhw, leaving
 * chroma in mm3/mm4 and luma in mm1/mm7.  Loop 1 is left open for the
 * caller to close. */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
416

    
417
/* Two-tap chroma blend for the RGB path: interpolates uvbuf0/uvbuf1 with
 * the chroma blend factor, >>4 scaling, then applies the U/V offsets and
 * green coefficients.  Leaves mm2/mm5 = offset U/V, mm3/mm4 = green
 * contributions; loop 1 is left open.  The wrapper is completed by
 * REAL_YSCALEYUV2RGB_YA and REAL_YSCALEYUV2RGB_COEFF. */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\


    
442
/* Two-tap luma blend for the RGB path: interpolates buf0 (b1) and buf1 (b2)
 * with the luma blend factor stored at LUM_MMX_FILTER_OFFSET+8, leaving the
 * two 4-pixel luma word vectors in mm1 and mm7. */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

455

    
456
/* Final coefficient stage of the two-tap RGB path: applies the blue/red
 * chroma coefficients and the luma offset/gain, widens per channel with
 * the self-punpck trick, adds luma, and saturate-packs.
 * Expects the register layout left by REAL_YSCALEYUV2RGB_UV/_YA;
 * produces mm2 = B bytes, mm5 = R bytes, mm4 = G bytes (8 pixels). */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

484

    
485
/* Argument-expanding wrapper for the luma blend stage. */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/* Full two-tap YUV->RGB kernel: chroma blend, luma blend over operands
 * %0/%1 (buf0/buf1), then the coefficient/pack stage. */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
491

    
492
/* One-tap (single source row, no vertical blend) packed-YUV kernel:
 * simply scales uvbuf0 chroma and buf0 luma down by >>7 into
 * mm3/mm4 (U/V) and mm1/mm7 (luma); loop 1 left open for the caller. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
506

    
507
#define REAL_YSCALEYUV2RGB1(index, c) \
508
    "xor            "#index", "#index"  \n\t"\
509
    ASMALIGN(4)\
510
    "1:                                 \n\t"\
511
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
512
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
513
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
514
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
515
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
516
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
517
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
518
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
519
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
520
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
521
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
522
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
523
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
524
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
525
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
526
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
527
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
528
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
529
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
530
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
531
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
532
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
533
    "paddw             %%mm3, %%mm4     \n\t"\
534
    "movq              %%mm2, %%mm0     \n\t"\
535
    "movq              %%mm5, %%mm6     \n\t"\
536
    "movq              %%mm4, %%mm3     \n\t"\
537
    "punpcklwd         %%mm2, %%mm2     \n\t"\
538
    "punpcklwd         %%mm5, %%mm5     \n\t"\
539
    "punpcklwd         %%mm4, %%mm4     \n\t"\
540
    "paddw             %%mm1, %%mm2     \n\t"\
541
    "paddw             %%mm1, %%mm5     \n\t"\
542
    "paddw             %%mm1, %%mm4     \n\t"\
543
    "punpckhwd         %%mm0, %%mm0     \n\t"\
544
    "punpckhwd         %%mm6, %%mm6     \n\t"\
545
    "punpckhwd         %%mm3, %%mm3     \n\t"\
546
    "paddw             %%mm7, %%mm0     \n\t"\
547
    "paddw             %%mm7, %%mm6     \n\t"\
548
    "paddw             %%mm7, %%mm3     \n\t"\
549
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
550
    "packuswb          %%mm0, %%mm2     \n\t"\
551
    "packuswb          %%mm6, %%mm5     \n\t"\
552
    "packuswb          %%mm3, %%mm4     \n\t"\
553

    
554
/* Indirection so that index/c are macro-expanded before being stringified
 * inside REAL_YSCALEYUV2RGB1. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
/* Prepare one 8-pixel group for packed (YUYV-style) output from a single
 * luma buffer and TWO chroma buffers: the chroma of uvbuf0 (%2) and
 * uvbuf1 (%3) is summed and shifted into byte range, the luma of buf0 (%0)
 * is shifted by 7.  No YUV->RGB conversion happens here. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
// do vertical chrominance interpolation
/* Like REAL_YSCALEYUV2RGB1 but the chroma is taken as the sum of the two
 * chroma buffers (%2, %3) shifted down, then converted to RGB: the result
 * is left as packed bytes in mm2/mm4/mm5 (B, G, R) for a WRITE* macro. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
/* Load 8 alpha samples from abuf0 (%1), shift them from 15-bit intermediate
 * precision down by 7 and pack them to unsigned bytes in mm7. */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
/* Interleave the byte-packed b/g/r/a registers into four quadwords of
 * 32-bit pixels, store them at dst+index*4 with MOVNTQ, advance index by 8
 * pixels and loop back to label 1 while index < dstw.  q0/q2/q3/t are
 * scratch registers; b/g/r are clobbered. */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
/* Pack the byte B/G/R values in mm2/mm4/mm5 into RGB565 (5-6-5 bit masks
 * bF8/bFC), store 8 pixels at dst+index*2 with MOVNTQ, advance index and
 * loop back to label 1 while index < dstw.  Expects mm7 == 0. */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
/* Pack the byte B/G/R values in mm2/mm4/mm5 into RGB555 (5-5-5, mask bF8),
 * store 8 pixels at dst+index*2 with MOVNTQ, advance index and loop back to
 * label 1 while index < dstw.  Expects mm7 == 0. */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
/* Old mask/shift-based 24-bit packer: squeezes four 0RGB dwords into three
 * quadwords (24 output bytes), stores them at dst, advances dst by 24 and
 * index by 8, then loops back to label 1 while index < dstw.
 * Superseded by WRITEBGR24MMX/WRITEBGR24MMX2 below; kept for reference. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
/* Plain-MMX 24-bit packer: builds four 0RGBRGB0 quadwords via shifts and
 * punpckhdq, merges them into three 8-byte stores (24 bytes = 8 pixels),
 * advances dst by 24 and index by 8, loops to label 1 while index < dstw. */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
/* MMX2 24-bit packer: uses pshufw plus the ff_M24A/B/C byte-select masks to
 * gather B/G/R lanes directly into three 8-byte stores (24 bytes = 8 pixels),
 * advances dst by 24 and index by 8, loops to label 1 while index < dstw. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
/* Select the 24-bit packer matching the instruction set this template is
 * being compiled for: pshufw-based variant on MMX2, plain MMX otherwise. */
#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
/* Pack luma (mm1/mm7) and chroma (mm3=U, mm4=V) bytes into YUYV order and
 * store 8 pixels at dst+index*2 with MOVNTQ, advance index by 8 and loop
 * back to label 1 while index < dstw. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
/**
 * Vertically filter multiple source lines into one planar YV12 output line
 * (plus optional alpha plane).  Dispatches to MMX assembly when available
 * and bit-exact output is not requested, otherwise to the AltiVec or plain
 * C implementation.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    /* NOTE: the AltiVec path does not handle the alpha plane (alpSrc/aDest). */
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}
/**
 * Vertically filter multiple source lines into one NV12/NV21 output line.
 * No SIMD variant exists for this path; it always delegates to the C
 * implementation.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
/**
 * Unscaled (single-source-line) vertical output: convert one line of 15-bit
 * intermediate luma/chroma/alpha back to 8-bit planar output with rounding
 * ((x+64)>>7) and clipping.  The MMX path runs the same rounding in assembly
 * over each plane that has a destination; the C fallback below is used when
 * bit-exact output is requested or MMX is not compiled in.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        /* Plane order: alpha, luma, U, V; V samples live at offset VOFW in chrSrc. */
        const uint8_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {    /* skip planes without a destination (e.g. no alpha) */
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        /* bit 8 set means the value left [0,255]; clamp to the nearer bound */
        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
/**
1023
 * vertical scale YV12 to RGB
1024
 */
1025
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1026
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1027
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1028
{
1029
#if COMPILE_TEMPLATE_MMX
1030
    x86_reg dummy=0;
1031
    if(!(c->flags & SWS_BITEXACT)) {
1032
        if (c->flags & SWS_ACCURATE_RND) {
1033
            switch(c->dstFormat) {
1034
            case PIX_FMT_RGB32:
1035
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1036
                    YSCALEYUV2PACKEDX_ACCURATE
1037
                    YSCALEYUV2RGBX
1038
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1039
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1040
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1041
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1042
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1043
                    "psraw                        $3, %%mm1         \n\t"
1044
                    "psraw                        $3, %%mm7         \n\t"
1045
                    "packuswb                  %%mm7, %%mm1         \n\t"
1046
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1047

    
1048
                    YSCALEYUV2PACKEDX_END
1049
                } else {
1050
                    YSCALEYUV2PACKEDX_ACCURATE
1051
                    YSCALEYUV2RGBX
1052
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1053
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1054

    
1055
                    YSCALEYUV2PACKEDX_END
1056
                }
1057
                return;
1058
            case PIX_FMT_BGR24:
1059
                YSCALEYUV2PACKEDX_ACCURATE
1060
                YSCALEYUV2RGBX
1061
                "pxor %%mm7, %%mm7 \n\t"
1062
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1063
                "add %4, %%"REG_c"                        \n\t"
1064
                WRITEBGR24(%%REGc, %5, %%REGa)
1065

    
1066

    
1067
                :: "r" (&c->redDither),
1068
                "m" (dummy), "m" (dummy), "m" (dummy),
1069
                "r" (dest), "m" (dstW)
1070
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1071
                );
1072
                return;
1073
            case PIX_FMT_RGB555:
1074
                YSCALEYUV2PACKEDX_ACCURATE
1075
                YSCALEYUV2RGBX
1076
                "pxor %%mm7, %%mm7 \n\t"
1077
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1078
#ifdef DITHER1XBPP
1079
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1080
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1081
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1082
#endif
1083

    
1084
                WRITERGB15(%4, %5, %%REGa)
1085
                YSCALEYUV2PACKEDX_END
1086
                return;
1087
            case PIX_FMT_RGB565:
1088
                YSCALEYUV2PACKEDX_ACCURATE
1089
                YSCALEYUV2RGBX
1090
                "pxor %%mm7, %%mm7 \n\t"
1091
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1092
#ifdef DITHER1XBPP
1093
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1094
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1095
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1096
#endif
1097

    
1098
                WRITERGB16(%4, %5, %%REGa)
1099
                YSCALEYUV2PACKEDX_END
1100
                return;
1101
            case PIX_FMT_YUYV422:
1102
                YSCALEYUV2PACKEDX_ACCURATE
1103
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1104

    
1105
                "psraw $3, %%mm3    \n\t"
1106
                "psraw $3, %%mm4    \n\t"
1107
                "psraw $3, %%mm1    \n\t"
1108
                "psraw $3, %%mm7    \n\t"
1109
                WRITEYUY2(%4, %5, %%REGa)
1110
                YSCALEYUV2PACKEDX_END
1111
                return;
1112
            }
1113
        } else {
1114
            switch(c->dstFormat) {
1115
            case PIX_FMT_RGB32:
1116
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1117
                    YSCALEYUV2PACKEDX
1118
                    YSCALEYUV2RGBX
1119
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1120
                    "psraw                        $3, %%mm1         \n\t"
1121
                    "psraw                        $3, %%mm7         \n\t"
1122
                    "packuswb                  %%mm7, %%mm1         \n\t"
1123
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1124
                    YSCALEYUV2PACKEDX_END
1125
                } else {
1126
                    YSCALEYUV2PACKEDX
1127
                    YSCALEYUV2RGBX
1128
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1129
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1130
                    YSCALEYUV2PACKEDX_END
1131
                }
1132
                return;
1133
            case PIX_FMT_BGR24:
1134
                YSCALEYUV2PACKEDX
1135
                YSCALEYUV2RGBX
1136
                "pxor                    %%mm7, %%mm7       \n\t"
1137
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1138
                "add                        %4, %%"REG_c"   \n\t"
1139
                WRITEBGR24(%%REGc, %5, %%REGa)
1140

    
1141
                :: "r" (&c->redDither),
1142
                "m" (dummy), "m" (dummy), "m" (dummy),
1143
                "r" (dest),  "m" (dstW)
1144
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1145
                );
1146
                return;
1147
            case PIX_FMT_RGB555:
1148
                YSCALEYUV2PACKEDX
1149
                YSCALEYUV2RGBX
1150
                "pxor %%mm7, %%mm7 \n\t"
1151
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1152
#ifdef DITHER1XBPP
1153
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1154
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1155
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1156
#endif
1157

    
1158
                WRITERGB15(%4, %5, %%REGa)
1159
                YSCALEYUV2PACKEDX_END
1160
                return;
1161
            case PIX_FMT_RGB565:
1162
                YSCALEYUV2PACKEDX
1163
                YSCALEYUV2RGBX
1164
                "pxor %%mm7, %%mm7 \n\t"
1165
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1166
#ifdef DITHER1XBPP
1167
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1168
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1169
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1170
#endif
1171

    
1172
                WRITERGB16(%4, %5, %%REGa)
1173
                YSCALEYUV2PACKEDX_END
1174
                return;
1175
            case PIX_FMT_YUYV422:
1176
                YSCALEYUV2PACKEDX
1177
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1178

    
1179
                "psraw $3, %%mm3    \n\t"
1180
                "psraw $3, %%mm4    \n\t"
1181
                "psraw $3, %%mm1    \n\t"
1182
                "psraw $3, %%mm7    \n\t"
1183
                WRITEYUY2(%4, %5, %%REGa)
1184
                YSCALEYUV2PACKEDX_END
1185
                return;
1186
            }
1187
        }
1188
    }
1189
#endif /* COMPILE_TEMPLATE_MMX */
1190
#if COMPILE_TEMPLATE_ALTIVEC
1191
    /* The following list of supported dstFormat values should
1192
       match what's found in the body of ff_yuv2packedX_altivec() */
1193
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1194
         (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1195
          c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1196
          c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1197
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1198
                                   chrFilter, chrSrc, chrFilterSize,
1199
                                   dest, dstW, dstY);
1200
    else
1201
#endif
1202
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1203
                       chrFilter, chrSrc, chrFilterSize,
1204
                       alpSrc, dest, dstW, dstY);
1205
}
1206

    
1207
/**
 * vertical bilinear scale YV12 to RGB
 *
 * Blends two luma lines (buf0/buf1, weighted by yalpha) and two chroma
 * lines (uvbuf0/uvbuf1, weighted by uvalpha) and writes one packed output
 * line into dest.  abuf0/abuf1 are the corresponding alpha lines (used
 * only for PIX_FMT_RGB32 when an alpha plane exists).  yalpha/uvalpha are
 * 12-bit blend weights (0..4096).
 */
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
{
    int  yalpha1=4095- yalpha;  // complementary 12-bit weight for buf0/uvbuf0
    int uvalpha1=4095-uvalpha;
    int i;

#if COMPILE_TEMPLATE_MMX
    /* MMX fast paths; skipped when bit-exact output is requested because the
     * asm rounding differs from the C reference. */
    if(!(c->flags & SWS_BITEXACT)) {
        switch(c->dstFormat) {
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
        case PIX_FMT_RGB32:
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
#if ARCH_X86_64
                /* x86-64 has enough registers to pass abuf0/abuf1 directly
                 * (%6/%7) and use %%r8 as the loop counter. */
                __asm__ volatile(
                    YSCALEYUV2RGB(%%r8, %5)
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb            %%mm7, %%mm1       \n\t"
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
                    "a" (&c->redDither)
                    ,"r" (abuf0), "r" (abuf1)
                    : "%r8"
                );
#else
                /* On x86-32 there are no free registers for the alpha
                 * pointers, so stash them in the context and reload them
                 * inside the asm via U_TEMP/V_TEMP. */
                *(const uint16_t **)(&c->u_temp)=abuf0;
                *(const uint16_t **)(&c->v_temp)=abuf1;
                __asm__ volatile(
                    /* spill %ebx into the context; NOTE(review): presumably
                     * because it cannot be clobbered under PIC — confirm */
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "push                   %0              \n\t"
                    "push                   %1              \n\t"
                    "mov          "U_TEMP"(%5), %0          \n\t"
                    "mov          "V_TEMP"(%5), %1          \n\t"
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
                    "packuswb            %%mm7, %%mm1       \n\t"
                    "pop                    %1              \n\t"
                    "pop                    %0              \n\t"
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
#endif
            } else {
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB(%%REGBP, %5)
                    "pcmpeqd %%mm7, %%mm7                   \n\t" /* opaque alpha: all bits set */
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
            }
            return;
        case PIX_FMT_BGR24:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB555:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                /* add per-channel ordered-dither offsets before truncating to 15bpp */
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_RGB565:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov        %4, %%"REG_b"               \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2RGB(%%REGBP, %5)
                "pxor    %%mm7, %%mm7                   \n\t"
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                "mov %4, %%"REG_b"                        \n\t"
                "push %%"REG_BP"                        \n\t"
                YSCALEYUV2PACKED(%%REGBP, %5)
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                "pop %%"REG_BP"                         \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                "a" (&c->redDither)
            );
            return;
        default: break;
        }
    }
#endif //COMPILE_TEMPLATE_MMX
    /* generic C fallback covering all remaining destination formats */
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
}
1355

    
1356
/**
 * YV12 to RGB without scaling or interpolating
 *
 * Converts a single luma line (buf0, plus alpha abuf0 if present) and
 * either one chroma line (uvalpha < 2048) or the average of two chroma
 * lines (uvbuf0/uvbuf1) into a packed output line in dest.
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT) {
        /* full horizontal chroma interpolation is only implemented in the
         * two-line path; delegate with zero luma blend weight */
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
        return;
    }

#if COMPILE_TEMPLATE_MMX
    if(!(flags & SWS_BITEXACT)) {
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7                   \n\t" /* opaque alpha: all bits set */
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    /* per-channel ordered-dither offsets before 15bpp truncation */
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        } else {
            /* uvalpha >= 2048: use the *1b macro variants, which average the
             * two chroma lines instead of taking only uvbuf0 */
            switch(dstFormat) {
            case PIX_FMT_RGB32:
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                } else {
                    __asm__ volatile(
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                        "mov        %4, %%"REG_b"               \n\t"
                        "push %%"REG_BP"                        \n\t"
                        YSCALEYUV2RGB1b(%%REGBP, %5)
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                        "pop %%"REG_BP"                         \n\t"
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                        "a" (&c->redDither)
                    );
                }
                return;
            case PIX_FMT_BGR24:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB555:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_RGB565:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2RGB1b(%%REGBP, %5)
                    "pxor    %%mm7, %%mm7                   \n\t"
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif

                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            case PIX_FMT_YUYV422:
                __asm__ volatile(
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
                    "mov        %4, %%"REG_b"               \n\t"
                    "push %%"REG_BP"                        \n\t"
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
                    "pop %%"REG_BP"                         \n\t"
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
                    "a" (&c->redDither)
                );
                return;
            }
        }
    }
#endif /* COMPILE_TEMPLATE_MMX */
    /* C fallback: pick the single-line or chroma-averaging variant */
    if (uvalpha < 2048) {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    } else {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
    }
}
1596

    
1597
//FIXME yuy2* can read up to 7 samples too much
1598

    
1599
/* Extract the luma plane from packed YUYV: dst[i] = src[2*i] (the even
 * bytes).  The MMX path masks out chroma with bm01010101 and handles
 * 8 pixels per iteration, counting a negative index up to zero. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "pand                %%mm2, %%mm0           \n\t" /* keep only luma bytes */
        "pand                %%mm2, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1623

    
1624
/* Extract U and V planes from packed YUYV: dstU[i] = src1[4*i+1],
 * dstV[i] = src1[4*i+3].  src2 must alias src1 (asserted below); both
 * chroma components come from the single interleaved buffer. */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* drop luma, keep chroma bytes */
        "psrlw                  $8, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t" /* mm0 = interleaved U/V */
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* mm0 = V samples */
        "pand                %%mm4, %%mm1           \n\t" /* mm1 = U samples */
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1657

    
1658
/* Take the high byte of each little-endian 16-bit sample from two separate
 * chroma planes: dstU[i] = src1[2*i+1], dstV[i] = src2[2*i+1].
 * The MMX path shifts each word right by 8 and repacks, 8 samples per
 * plane per iteration. */
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* keep the high byte of each word */
        "psrlw                  $8, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
#endif
}
1689

    
1690
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses.
 * Extracts luma from packed UYVY: dst[i] = src[2*i+1] (the odd bytes). */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov                  %0, %%"REG_a"         \n\t"
        "1:                                         \n\t"
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
        "psrlw                $8, %%mm0             \n\t" /* keep odd (luma) bytes */
        "psrlw                $8, %%mm1             \n\t"
        "packuswb          %%mm1, %%mm0             \n\t"
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
        "add                  $8, %%"REG_a"         \n\t"
        " js                  1b                    \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1715

    
1716
/* Extract U and V planes from packed UYVY: dstU[i] = src1[4*i],
 * dstV[i] = src1[4*i+2].  src2 must alias src1 (asserted below). */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "pand                %%mm4, %%mm0           \n\t" /* keep even (chroma) bytes */
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t" /* mm0 = interleaved U/V */
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t" /* mm0 = V samples */
        "pand                %%mm4, %%mm1           \n\t" /* mm1 = U samples */
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
1749

    
1750
/* Take the low-address byte of each big-endian 16-bit sample from two
 * separate chroma planes: dstU[i] = src1[2*i], dstV[i] = src2[2*i].
 * Counterpart of LEToUV for big-endian input. */
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t" /* keep the low byte of each word */
        "pand                %%mm4, %%mm1           \n\t"
        "pand                %%mm4, %%mm2           \n\t"
        "pand                %%mm4, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
#endif
}
1782

    
1783
/* De-interleave a packed two-component plane (as in NV12/NV21 chroma):
 * dst1[i] = src[2*i], dst2[i] = src[2*i+1].  The MMX path splits 8 byte
 * pairs per iteration via mask + shift. */
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq                %%mm0, %%mm2           \n\t"
        "movq                %%mm1, %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t" /* even bytes -> dst1 */
        "pand                %%mm4, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t" /* odd bytes -> dst2 */
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
#else
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i+0];
        dst2[i] = src[2*i+1];
    }
#endif
}
1816

    
1817
/* NV12 chroma: src1 holds interleaved U,V pairs; split into dstU/dstV.
 * src2 is unused (NV12 has a single interleaved chroma plane). */
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}
1823

    
1824
/* NV21 chroma: same as nv12ToUV but the interleaved order is V,U, so the
 * destination planes are swapped.  src2 is unused. */
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}
1830

    
1831
#if COMPILE_TEMPLATE_MMX
1832
/* Convert packed 24-bit BGR or RGB to luma using pmaddwd with 15-bit
 * fixed-point coefficients.  Processes 4 pixels (12 source bytes, 4 luma
 * bytes) per loop iteration.  The coefficient pair loaded into mm5/mm6
 * selects BGR vs RGB channel order. */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t" /* rounding/bias constant */
        "mov                        %2, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        /* load 4 pixels as two overlapping dword pairs so each pmaddwd
         * sees a full B,G,R triple after unpacking to words */
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t"
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t"
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t" /* scale back from 15-bit fixed point */
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    : "%"REG_a
    );
}
1884

    
1885
/* MMX conversion of one line of packed 24-bit BGR/RGB pixels to 8-bit U and V.
 * %4 is ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0]: pmaddwd coefficient pairs
 * selected by the component order of srcFormat (U coeffs at +0/+8, V at +16/+24).
 * Processes 4 pixels (12 input bytes, 4 U + 4 V output bytes) per iteration;
 * width is expected to be a multiple of 4 — TODO confirm with callers.
 * NOTE(review): clobbers mm0-mm7 without issuing EMMS; presumably the caller
 * handles MMX state — verify. */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq                    24+%4, %%mm6       \n\t" // 4th coefficient pair, cached in a register
        "mov                        %3, %%"REG_a"   \n\t" // REG_a = -width: negative index counted up to 0
        "pxor                    %%mm7, %%mm7       \n\t" // zero, for byte->word unpacking
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        /* pixels 0 and 1 */
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                    %4, %%mm0       \n\t" // U coefficients
        "pmaddwd                  8+%4, %%mm1       \n\t"
        "pmaddwd                 16+%4, %%mm2       \n\t" // V coefficients
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t" // mm0 = U sums (pixels 0,1)
        "paddd                   %%mm3, %%mm2       \n\t" // mm2 = V sums (pixels 0,1)

        /* pixels 2 and 3 */
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t" // advance src by 4 packed pixels
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                    %4, %%mm1       \n\t"
        "pmaddwd                  8+%4, %%mm3       \n\t"
        "pmaddwd                 16+%4, %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t" // mm1 = U sums (pixels 2,3)
        "paddd                   %%mm5, %%mm4       \n\t" // mm4 = V sums (pixels 2,3)

        /* add bias/rounding offset, scale down by 2^15 and pack to unsigned bytes */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t" // store 4 U bytes
        "movd                %%mm2, (%2, %%"REG_a") \n\t" // store 4 V bytes
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t" // loop while index still negative
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    : "%"REG_a
    );
}
1942
#endif
1943

    
1944
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1945
{
1946
#if COMPILE_TEMPLATE_MMX
1947
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1948
#else
1949
    int i;
1950
    for (i=0; i<width; i++) {
1951
        int b= src[i*3+0];
1952
        int g= src[i*3+1];
1953
        int r= src[i*3+2];
1954

    
1955
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1956
    }
1957
#endif /* COMPILE_TEMPLATE_MMX */
1958
}
1959

    
1960
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1961
{
1962
#if COMPILE_TEMPLATE_MMX
1963
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1964
#else
1965
    int i;
1966
    for (i=0; i<width; i++) {
1967
        int b= src1[3*i + 0];
1968
        int g= src1[3*i + 1];
1969
        int r= src1[3*i + 2];
1970

    
1971
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1972
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1973
    }
1974
#endif /* COMPILE_TEMPLATE_MMX */
1975
    assert(src1 == src2);
1976
}
1977

    
1978
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1979
{
1980
    int i;
1981
    for (i=0; i<width; i++) {
1982
        int b= src1[6*i + 0] + src1[6*i + 3];
1983
        int g= src1[6*i + 1] + src1[6*i + 4];
1984
        int r= src1[6*i + 2] + src1[6*i + 5];
1985

    
1986
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1987
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1988
    }
1989
    assert(src1 == src2);
1990
}
1991

    
1992
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1993
{
1994
#if COMPILE_TEMPLATE_MMX
1995
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1996
#else
1997
    int i;
1998
    for (i=0; i<width; i++) {
1999
        int r= src[i*3+0];
2000
        int g= src[i*3+1];
2001
        int b= src[i*3+2];
2002

    
2003
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2004
    }
2005
#endif
2006
}
2007

    
2008
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2009
{
2010
#if COMPILE_TEMPLATE_MMX
2011
    assert(src1==src2);
2012
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2013
#else
2014
    int i;
2015
    assert(src1==src2);
2016
    for (i=0; i<width; i++) {
2017
        int r= src1[3*i + 0];
2018
        int g= src1[3*i + 1];
2019
        int b= src1[3*i + 2];
2020

    
2021
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2022
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2023
    }
2024
#endif
2025
}
2026

    
2027
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2028
{
2029
    int i;
2030
    assert(src1==src2);
2031
    for (i=0; i<width; i++) {
2032
        int r= src1[6*i + 0] + src1[6*i + 3];
2033
        int g= src1[6*i + 1] + src1[6*i + 4];
2034
        int b= src1[6*i + 2] + src1[6*i + 5];
2035

    
2036
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2037
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2038
    }
2039
}
2040

    
2041

    
2042
// bilinear / bicubic scaling
2043
/* Generic horizontal scaler:
 *   dst[i] = (sum_j src[filterPos[i]+j] * filter[i*filterSize+j]) >> 7
 * (clipped to int16 range in the C path; the MMX paths rely on packssdw
 * saturation instead of an explicit FFMIN).
 * The MMX build has dedicated loops for filterSize 4 and 8 that produce two
 * output pixels per iteration (counter runs from -2*dstW up to 0, so pointers
 * are pre-biased by -counter), plus a generic nested loop for other sizes. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
{
#if COMPILE_TEMPLATE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
        x86_reg counter= -2*dstW;
        /* bias pointers so the negative counter can index them directly */
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push            %%"REG_b"              \n\t"
#endif
            "pxor                %%mm7, %%mm7       \n\t"
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
            ASMALIGN(4)
            "1:                                     \n\t"
            /* load the two source positions for this output pixel pair */
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
            "movd      (%3, %%"REG_a"), %%mm0       \n\t"
            "movd      (%3, %%"REG_b"), %%mm2       \n\t"
            "punpcklbw           %%mm7, %%mm0       \n\t"
            "punpcklbw           %%mm7, %%mm2       \n\t"
            "pmaddwd             %%mm1, %%mm0       \n\t"
            "pmaddwd             %%mm2, %%mm3       \n\t"
            /* horizontal add of the 4-tap partial sums, then >>7 and pack */
            "movq                %%mm0, %%mm4       \n\t"
            "punpckldq           %%mm3, %%mm0       \n\t"
            "punpckhdq           %%mm3, %%mm4       \n\t"
            "paddd               %%mm4, %%mm0       \n\t"
            "psrad                  $7, %%mm0       \n\t"
            "packssdw            %%mm0, %%mm0       \n\t"
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
            "add                    $4, %%"REG_BP"  \n\t"
            " jnc                   1b              \n\t"

            "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
            "pop             %%"REG_b"              \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else if (filterSize==8) {
        x86_reg counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
#if defined(PIC)
            "push             %%"REG_b"             \n\t"
#endif
            "pxor                 %%mm7, %%mm7      \n\t"
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
            "mov              %%"REG_a", %%"REG_BP" \n\t"
            ASMALIGN(4)
            "1:                                     \n\t"
            /* first 4 taps of each of the two output pixels */
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
            "movd       (%3, %%"REG_a"), %%mm0      \n\t"
            "movd       (%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm0      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm0      \n\t"
            "pmaddwd              %%mm2, %%mm3      \n\t"

            /* remaining 4 taps */
            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
            "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
            "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
            "punpcklbw            %%mm7, %%mm4      \n\t"
            "punpcklbw            %%mm7, %%mm2      \n\t"
            "pmaddwd              %%mm1, %%mm4      \n\t"
            "pmaddwd              %%mm2, %%mm5      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "paddd                %%mm5, %%mm3      \n\t"
            "movq                 %%mm0, %%mm4      \n\t"
            "punpckldq            %%mm3, %%mm0      \n\t"
            "punpckhdq            %%mm3, %%mm4      \n\t"
            "paddd                %%mm4, %%mm0      \n\t"
            "psrad                   $7, %%mm0      \n\t"
            "packssdw             %%mm0, %%mm0      \n\t"
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
            "add                     $4, %%"REG_BP" \n\t"
            " jnc                    1b             \n\t"

            "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
            "pop              %%"REG_b"             \n\t"
#endif
            : "+a" (counter)
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
            : "%"REG_b
#endif
        );
    } else {
        /* generic filter size: inner loop (label 2) walks the taps 4 at a time */
        const uint8_t *offset = src+filterSize;
        x86_reg counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        __asm__ volatile(
            "pxor                  %%mm7, %%mm7     \n\t"
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov                      %2, %%"REG_c" \n\t"
            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
            "mov                      %5, %%"REG_c" \n\t"
            "pxor                  %%mm4, %%mm4     \n\t"
            "pxor                  %%mm5, %%mm5     \n\t"
            "2:                                     \n\t"
            "movq                   (%1), %%mm1     \n\t"
            "movq               (%1, %6), %%mm3     \n\t"
            "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
            "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
            "punpcklbw             %%mm7, %%mm0     \n\t"
            "punpcklbw             %%mm7, %%mm2     \n\t"
            "pmaddwd               %%mm1, %%mm0     \n\t"
            "pmaddwd               %%mm2, %%mm3     \n\t"
            "paddd                 %%mm3, %%mm5     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "add                      $8, %1        \n\t"
            "add                      $4, %%"REG_c" \n\t"
            "cmp                      %4, %%"REG_c" \n\t"
            " jb                      2b            \n\t"
            "add                      %6, %1        \n\t"
            "movq                  %%mm4, %%mm0     \n\t"
            "punpckldq             %%mm5, %%mm4     \n\t"
            "punpckhdq             %%mm5, %%mm0     \n\t"
            "paddd                 %%mm0, %%mm4     \n\t"
            "psrad                    $7, %%mm4     \n\t"
            "packssdw              %%mm4, %%mm4     \n\t"
            "mov                      %3, %%"REG_a" \n\t"
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
            "add                      $4, %0        \n\t"
            " jnc                     1b            \n\t"

            : "+r" (counter), "+r" (filter)
            : "m" (filterPos), "m" (dst), "m"(offset),
            "m" (src), "r" ((x86_reg)filterSize*2)
            : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#if COMPILE_TEMPLATE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* portable reference implementation */
    int i;
    for (i=0; i<dstW; i++) {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++) {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* COMPILE_ALTIVEC */
#endif /* COMPILE_MMX */
}
2217

    
2218
//FIXME all pal and rgb srcFormats could do this convertion as well
2219
//FIXME all scalers more complex than bilinear could do half of this transform
2220
static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2221
{
2222
    int i;
2223
    for (i = 0; i < width; i++) {
2224
        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2225
        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2226
    }
2227
}
2228
static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2229
{
2230
    int i;
2231
    for (i = 0; i < width; i++) {
2232
        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
2233
        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2234
    }
2235
}
2236
static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2237
{
2238
    int i;
2239
    for (i = 0; i < width; i++)
2240
        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2241
}
2242
static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2243
{
2244
    int i;
2245
    for (i = 0; i < width; i++)
2246
        dst[i] = (dst[i]*14071 + 33561947)>>14;
2247
}
2248

    
2249
/* Core of the fast bilinear horizontal scaler (plain-x86 asm path).
 * On entry: %%edi = src[xx], %%esi = src[xx+1], %%ecx = xalpha (fractional x).
 * Leaves the interpolated sample in %%esi (after >>9) and reloads %%REG_D
 * with the destination pointer from operand %1. */
#define FAST_BILINEAR_X86 \
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
    "shll      $16, %%edi    \n\t"                                              \
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
    "mov        %1, %%"REG_D"\n\t"                                              \
    "shrl       $9, %%esi    \n\t"                                              \

2256

    
2257
/* Fast bilinear horizontal scale of one luma line (2-tap interpolation, no
 * general filter).  With MMX2 it calls the runtime-generated scaler code in
 * c->lumMmx2FilterCode when canMMX2BeUsed; the trailing C loop then fixes the
 * destination pixels whose source position reaches srcW-1 (the generated code
 * would read past the line end there).  Otherwise a plain x86 asm loop (or the
 * portable C loop) computes (src[xx]<<7) + (src[xx+1]-src[xx])*((xpos&0xFFFF)>>9). */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter    = c->hLumFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);  /* manual ebx save: ebx is the PIC register */
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

/* Invoke one chunk of the generated filter code, then advance the source and
 * destination pointers using the filterPos table and reset the index. */
#define CALL_MMX2_FILTER_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* patch up trailing pixels whose source index hits the line end */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    //NO MMX just normal asm ...
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* portable C fallback */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2378

    
2379
      // *** horizontal scale Y line to temp buffer
2380
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2381
                                   const int16_t *hLumFilter,
2382
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
2383
                                   uint8_t *formatConvBuffer,
2384
                                   uint32_t *pal, int isAlpha)
2385
{
2386
    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2387
    void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2388

    
2389
    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2390

    
2391
    if (toYV12) {
2392
        toYV12(formatConvBuffer, src, srcW, pal);
2393
        src= formatConvBuffer;
2394
    }
2395

    
2396
    if (!c->hyscale_fast) {
2397
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2398
    } else { // fast bilinear upscale / crap downscale
2399
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2400
    }
2401

    
2402
    if (convertRange)
2403
        convertRange(dst, dstWidth);
2404
}
2405

    
2406
/* Fast bilinear horizontal scale of one chroma line pair: src1 goes to dst,
 * src2 to dst+VOFW.  Mirrors hyscale_fast: the MMX2 path calls the generated
 * code in c->chrMmx2FilterCode (4 chunks per plane) and fixes the trailing
 * pixels in C; otherwise a plain x86 asm loop or the portable C loop is used. */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter    = c->hChrFilter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);  /* manual ebx save: ebx is the PIC register */
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            /* first plane (src1 -> dst) */
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            /* second plane (src2 -> dst+VOF) */
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (filter), "m" (filterPos),
            "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* patch up trailing pixels whose source index hits the line end */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            //printf("%d %d %d\n", dstWidth, i, srcW);
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
            :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    /* portable C fallback */
    int i;
    unsigned int xpos=0;
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2526

    
2527
/* Optionally converts the input chroma to planar U/V via the context's
 * chrToYV12 hook, horizontally scales both planes into dst / dst+VOFW,
 * then applies the chroma range conversion if configured. */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{
    /* apply the configured source pointer offset */
    src1 += c->chrSrcOffset;
    src2 += c->chrSrcOffset;

    /* input-format -> planar conversion into the scratch buffer, if needed */
    if (c->chrToYV12) {
        uint8_t *buf2 = formatConvBuffer + VOFW;
        c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
        src1 = formatConvBuffer;
        src2 = buf2;
    }

    if (c->hcscale_fast) { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
    } else {
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
}
2553

    
2554
/* Set DEBUG_SWSCALE_BUFFERS to 1 to enable av_log tracing via DEBUG_BUFFERS()
 * inside swScale() (requires a SwsContext named 'c' in scope at the call site). */
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2556

    
2557
/**
 * Main scaling entry point for this CPU-specific template instantiation.
 *
 * Converts one horizontal slice of the source image (srcSliceH lines
 * starting at srcSliceY) into the destination. Source lines are first
 * scaled horizontally into the luma/chroma ring buffers
 * (lumPixBuf/chrPixBuf/alpPixBuf), then the vertical filter combines the
 * buffered lines into one output row at a time via the yuv2* output
 * functions.
 *
 * @return the number of destination lines written (dstY - lastDstY)
 */
static int RENAME(swScale)(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample); /* round up, not down */
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* packed formats carry all components in plane 0: mirror pointer and
     * stride into the other slots so the per-plane code below works */
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        /* first slice: reset the ring-buffer bookkeeping */
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
        if (!enough_lines) {
            /* not enough input: clamp to what this slice provides; we will
             * buffer these lines and bail out of the loop further below */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
        }

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
                            formatConvBuffer,
                            pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
                                formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
        }
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
                                formatConvBuffer,
                                pal);
            lastInChrBuf++;
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if COMPILE_TEMPLATE_MMX
        /* per-output-line ordered-dither constants used by the MMX
         * packed-RGB output code */
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2) {
            /* pointers into the ring buffers at the first line the vertical
             * filter for this output row needs */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            /* pack line pointers and 16-bit coefficients into the
             * lum/chr/alpMmxFilter arrays consumed by the MMX asm */
            int i;
            if (flags & SWS_ACCURATE_RND) {
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(const void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(const void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(const void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(const void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(const void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(const void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                for (i=0; i<vLumFilterSize; i++) {
                    /* split each line pointer into low/high 32-bit halves */
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *lumBuf = lumSrcPtr[0];
                    const int16_t *chrBuf= chrSrcPtr[0];
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            /* last two output lines: use the plain C output paths */
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else {
                    yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* destination wants an alpha plane but the source had none: fill
     * the written rows with opaque alpha */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
    else                             __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
2917

    
2918
/*
 * Initialize the function pointers of the SwsContext for this
 * CPU-specific template instantiation: vertical output functions
 * (yuv2*), horizontal scaler, fast bilinear paths, per-source-format
 * unpackers (lumToYV12/chrToYV12/alpToYV12), source byte offsets and
 * color-range converters.
 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );

    c->hScale       = RENAME(hScale      );

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
#else
    if (c->flags & SWS_FAST_BILINEAR)
#endif
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }

    /* chroma unpacker: convert the source's chroma to planar 4:2:2-style
     * U/V lines; NULL means the source is already planar YUV */
    c->chrToYV12 = NULL;
    switch(srcFormat) {
        case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
        case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
        case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
        case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
        case PIX_FMT_RGB8     :
        case PIX_FMT_BGR8     :
        case PIX_FMT_PAL8     :
        case PIX_FMT_BGR4_BYTE:
        case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
        case PIX_FMT_YUV420P16BE:
        case PIX_FMT_YUV422P16BE:
        case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
        case PIX_FMT_YUV420P16LE:
        case PIX_FMT_YUV422P16LE:
        case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
    }
    /* RGB sources: pick the *_half variants when chroma is horizontally
     * subsampled (they average adjacent pixel pairs) */
    if (c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
        }
    }

    /* luma and alpha unpackers; NULL means a plain planar copy suffices */
    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422  :
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
    case PIX_FMT_BGR32  :
    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
    }
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
        }
    }

    /* per-format byte offsets applied to the source pointers before
     * unpacking (e.g. to skip a leading alpha byte) */
    switch (srcFormat) {
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB32_1:
    case PIX_FMT_BGR32_1:
        c->lumSrcOffset = ALT32_CORR;
        c->chrSrcOffset = ALT32_CORR;
        break;
    case PIX_FMT_RGB48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    }

    /* range conversion only applies to YUV output; RGB output handles
     * range via the yuv2rgb tables instead */
    if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
        if (c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        }
    }

    /* gray and mono formats carry no chroma, so horizontal chroma
     * scaling can be skipped entirely */
    if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
          srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
        c->needs_hcscale = 1;
}