/* ffmpeg / libswscale / swscale_template.c @ revision 4b190455 */
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
23

    
24
#undef REAL_MOVNTQ
25
#undef MOVNTQ
26
#undef PAVGB
27
#undef PREFETCH
28

    
29
#if COMPILE_TEMPLATE_AMD3DNOW
30
#define PREFETCH  "prefetch"
31
#elif COMPILE_TEMPLATE_MMX2
32
#define PREFETCH "prefetchnta"
33
#else
34
#define PREFETCH  " # nop"
35
#endif
36

    
37
#if COMPILE_TEMPLATE_MMX2
38
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
39
#elif COMPILE_TEMPLATE_AMD3DNOW
40
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
41
#endif
42

    
43
#if COMPILE_TEMPLATE_MMX2
44
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
45
#else
46
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
47
#endif
48
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
49

    
50
#if COMPILE_TEMPLATE_ALTIVEC
51
#include "ppc/swscale_altivec_template.c"
52
#endif
53

    
54
#define YSCALEYUV2YV12X(x, offset, dest, width) \
55
    __asm__ volatile(\
56
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
57
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
58
        "movq                             %%mm3, %%mm4      \n\t"\
59
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
60
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
61
        ASMALIGN(4) /* FIXME Unroll? */\
62
        "1:                                                 \n\t"\
63
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
64
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
65
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
66
        "add                                $16, %%"REG_d"  \n\t"\
67
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
68
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
69
        "pmulhw                           %%mm0, %%mm2      \n\t"\
70
        "pmulhw                           %%mm0, %%mm5      \n\t"\
71
        "paddw                            %%mm2, %%mm3      \n\t"\
72
        "paddw                            %%mm5, %%mm4      \n\t"\
73
        " jnz                                1b             \n\t"\
74
        "psraw                               $3, %%mm3      \n\t"\
75
        "psraw                               $3, %%mm4      \n\t"\
76
        "packuswb                         %%mm4, %%mm3      \n\t"\
77
        MOVNTQ(%%mm3, (%1, %%REGa))\
78
        "add                                 $8, %%"REG_a"  \n\t"\
79
        "cmp                                 %2, %%"REG_a"  \n\t"\
80
        "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
81
        "movq                             %%mm3, %%mm4      \n\t"\
82
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
83
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
84
        "jb                                  1b             \n\t"\
85
        :: "r" (&c->redDither),\
86
        "r" (dest), "g" (width)\
87
        : "%"REG_a, "%"REG_d, "%"REG_S\
88
    );
89

    
90
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
91
    __asm__ volatile(\
92
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
93
        "xor                          %%"REG_a", %%"REG_a"  \n\t"\
94
        "pxor                             %%mm4, %%mm4      \n\t"\
95
        "pxor                             %%mm5, %%mm5      \n\t"\
96
        "pxor                             %%mm6, %%mm6      \n\t"\
97
        "pxor                             %%mm7, %%mm7      \n\t"\
98
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
99
        ASMALIGN(4) \
100
        "1:                                                 \n\t"\
101
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
102
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
103
        "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
104
        "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
105
        "movq                             %%mm0, %%mm3      \n\t"\
106
        "punpcklwd                        %%mm1, %%mm0      \n\t"\
107
        "punpckhwd                        %%mm1, %%mm3      \n\t"\
108
        "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
109
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
110
        "pmaddwd                          %%mm1, %%mm3      \n\t"\
111
        "paddd                            %%mm0, %%mm4      \n\t"\
112
        "paddd                            %%mm3, %%mm5      \n\t"\
113
        "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
114
        "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
115
        "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
116
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
117
        "movq                             %%mm2, %%mm0      \n\t"\
118
        "punpcklwd                        %%mm3, %%mm2      \n\t"\
119
        "punpckhwd                        %%mm3, %%mm0      \n\t"\
120
        "pmaddwd                          %%mm1, %%mm2      \n\t"\
121
        "pmaddwd                          %%mm1, %%mm0      \n\t"\
122
        "paddd                            %%mm2, %%mm6      \n\t"\
123
        "paddd                            %%mm0, %%mm7      \n\t"\
124
        " jnz                                1b             \n\t"\
125
        "psrad                              $16, %%mm4      \n\t"\
126
        "psrad                              $16, %%mm5      \n\t"\
127
        "psrad                              $16, %%mm6      \n\t"\
128
        "psrad                              $16, %%mm7      \n\t"\
129
        "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
130
        "packssdw                         %%mm5, %%mm4      \n\t"\
131
        "packssdw                         %%mm7, %%mm6      \n\t"\
132
        "paddw                            %%mm0, %%mm4      \n\t"\
133
        "paddw                            %%mm0, %%mm6      \n\t"\
134
        "psraw                               $3, %%mm4      \n\t"\
135
        "psraw                               $3, %%mm6      \n\t"\
136
        "packuswb                         %%mm6, %%mm4      \n\t"\
137
        MOVNTQ(%%mm4, (%1, %%REGa))\
138
        "add                                 $8, %%"REG_a"  \n\t"\
139
        "cmp                                 %2, %%"REG_a"  \n\t"\
140
        "lea                     " offset "(%0), %%"REG_d"  \n\t"\
141
        "pxor                             %%mm4, %%mm4      \n\t"\
142
        "pxor                             %%mm5, %%mm5      \n\t"\
143
        "pxor                             %%mm6, %%mm6      \n\t"\
144
        "pxor                             %%mm7, %%mm7      \n\t"\
145
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
146
        "jb                                  1b             \n\t"\
147
        :: "r" (&c->redDither),\
148
        "r" (dest), "g" (width)\
149
        : "%"REG_a, "%"REG_d, "%"REG_S\
150
    );
151

    
152
#define YSCALEYUV2YV121 \
153
    "mov %2, %%"REG_a"                    \n\t"\
154
    ASMALIGN(4) /* FIXME Unroll? */\
155
    "1:                                   \n\t"\
156
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
157
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
158
    "psraw                 $7, %%mm0      \n\t"\
159
    "psraw                 $7, %%mm1      \n\t"\
160
    "packuswb           %%mm1, %%mm0      \n\t"\
161
    MOVNTQ(%%mm0, (%1, %%REGa))\
162
    "add                   $8, %%"REG_a"  \n\t"\
163
    "jnc                   1b             \n\t"
164

    
165
#define YSCALEYUV2YV121_ACCURATE \
166
    "mov %2, %%"REG_a"                    \n\t"\
167
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
168
    "psrlw                 $15, %%mm7     \n\t"\
169
    "psllw                  $6, %%mm7     \n\t"\
170
    ASMALIGN(4) /* FIXME Unroll? */\
171
    "1:                                   \n\t"\
172
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
173
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
174
    "paddsw             %%mm7, %%mm0      \n\t"\
175
    "paddsw             %%mm7, %%mm1      \n\t"\
176
    "psraw                 $7, %%mm0      \n\t"\
177
    "psraw                 $7, %%mm1      \n\t"\
178
    "packuswb           %%mm1, %%mm0      \n\t"\
179
    MOVNTQ(%%mm0, (%1, %%REGa))\
180
    "add                   $8, %%"REG_a"  \n\t"\
181
    "jnc                   1b             \n\t"
182

    
183
/*
184
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
185
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
186
       "r" (dest), "m" (dstW),
187
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
188
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
189
*/
190
#define YSCALEYUV2PACKEDX_UV \
191
    __asm__ volatile(\
192
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
193
        ASMALIGN(4)\
194
        "nop                                            \n\t"\
195
        "1:                                             \n\t"\
196
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
197
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
198
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
199
        "movq                      %%mm3, %%mm4         \n\t"\
200
        ASMALIGN(4)\
201
        "2:                                             \n\t"\
202
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
203
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
204
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
205
        "add                         $16, %%"REG_d"     \n\t"\
206
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
207
        "pmulhw                    %%mm0, %%mm2         \n\t"\
208
        "pmulhw                    %%mm0, %%mm5         \n\t"\
209
        "paddw                     %%mm2, %%mm3         \n\t"\
210
        "paddw                     %%mm5, %%mm4         \n\t"\
211
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
212
        " jnz                         2b                \n\t"\
213

    
214
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
215
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
216
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
217
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
218
    "movq                    "#dst1", "#dst2"       \n\t"\
219
    ASMALIGN(4)\
220
    "2:                                             \n\t"\
221
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
222
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
223
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
224
    "add                         $16, %%"REG_d"            \n\t"\
225
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
226
    "pmulhw                 "#coeff", "#src1"       \n\t"\
227
    "pmulhw                 "#coeff", "#src2"       \n\t"\
228
    "paddw                   "#src1", "#dst1"       \n\t"\
229
    "paddw                   "#src2", "#dst2"       \n\t"\
230
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
231
    " jnz                         2b                \n\t"\
232

    
233
#define YSCALEYUV2PACKEDX \
234
    YSCALEYUV2PACKEDX_UV \
235
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
236

    
237
#define YSCALEYUV2PACKEDX_END                     \
238
        :: "r" (&c->redDither),                   \
239
            "m" (dummy), "m" (dummy), "m" (dummy),\
240
            "r" (dest), "m" (dstW)                \
241
        : "%"REG_a, "%"REG_d, "%"REG_S            \
242
    );
243

    
244
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
245
    __asm__ volatile(\
246
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
247
        ASMALIGN(4)\
248
        "nop                                            \n\t"\
249
        "1:                                             \n\t"\
250
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
251
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
252
        "pxor                      %%mm4, %%mm4         \n\t"\
253
        "pxor                      %%mm5, %%mm5         \n\t"\
254
        "pxor                      %%mm6, %%mm6         \n\t"\
255
        "pxor                      %%mm7, %%mm7         \n\t"\
256
        ASMALIGN(4)\
257
        "2:                                             \n\t"\
258
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
259
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
260
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
261
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
262
        "movq                      %%mm0, %%mm3         \n\t"\
263
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
264
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
265
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
266
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
267
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
268
        "paddd                     %%mm0, %%mm4         \n\t"\
269
        "paddd                     %%mm3, %%mm5         \n\t"\
270
        "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
271
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
272
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
273
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
274
        "movq                      %%mm2, %%mm0         \n\t"\
275
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
276
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
277
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
278
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
279
        "paddd                     %%mm2, %%mm6         \n\t"\
280
        "paddd                     %%mm0, %%mm7         \n\t"\
281
        " jnz                         2b                \n\t"\
282
        "psrad                       $16, %%mm4         \n\t"\
283
        "psrad                       $16, %%mm5         \n\t"\
284
        "psrad                       $16, %%mm6         \n\t"\
285
        "psrad                       $16, %%mm7         \n\t"\
286
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
287
        "packssdw                  %%mm5, %%mm4         \n\t"\
288
        "packssdw                  %%mm7, %%mm6         \n\t"\
289
        "paddw                     %%mm0, %%mm4         \n\t"\
290
        "paddw                     %%mm0, %%mm6         \n\t"\
291
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
292
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
293

    
294
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
295
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
296
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
297
    "pxor                      %%mm1, %%mm1         \n\t"\
298
    "pxor                      %%mm5, %%mm5         \n\t"\
299
    "pxor                      %%mm7, %%mm7         \n\t"\
300
    "pxor                      %%mm6, %%mm6         \n\t"\
301
    ASMALIGN(4)\
302
    "2:                                             \n\t"\
303
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
304
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
305
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
306
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
307
    "movq                      %%mm0, %%mm3         \n\t"\
308
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
309
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
310
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
311
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
312
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
313
    "paddd                     %%mm0, %%mm1         \n\t"\
314
    "paddd                     %%mm3, %%mm5         \n\t"\
315
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
316
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
317
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
318
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
319
    "movq                      %%mm2, %%mm0         \n\t"\
320
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
321
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
322
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
323
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
324
    "paddd                     %%mm2, %%mm7         \n\t"\
325
    "paddd                     %%mm0, %%mm6         \n\t"\
326
    " jnz                         2b                \n\t"\
327
    "psrad                       $16, %%mm1         \n\t"\
328
    "psrad                       $16, %%mm5         \n\t"\
329
    "psrad                       $16, %%mm7         \n\t"\
330
    "psrad                       $16, %%mm6         \n\t"\
331
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
332
    "packssdw                  %%mm5, %%mm1         \n\t"\
333
    "packssdw                  %%mm6, %%mm7         \n\t"\
334
    "paddw                     %%mm0, %%mm1         \n\t"\
335
    "paddw                     %%mm0, %%mm7         \n\t"\
336
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
337
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\
338

    
339
#define YSCALEYUV2PACKEDX_ACCURATE \
340
    YSCALEYUV2PACKEDX_ACCURATE_UV \
341
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
342

    
343
#define YSCALEYUV2RGBX \
344
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
345
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
346
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
347
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
348
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
349
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
350
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
351
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
352
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
353
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
354
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
355
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
356
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
357
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
358
    "paddw           %%mm3, %%mm4       \n\t"\
359
    "movq            %%mm2, %%mm0       \n\t"\
360
    "movq            %%mm5, %%mm6       \n\t"\
361
    "movq            %%mm4, %%mm3       \n\t"\
362
    "punpcklwd       %%mm2, %%mm2       \n\t"\
363
    "punpcklwd       %%mm5, %%mm5       \n\t"\
364
    "punpcklwd       %%mm4, %%mm4       \n\t"\
365
    "paddw           %%mm1, %%mm2       \n\t"\
366
    "paddw           %%mm1, %%mm5       \n\t"\
367
    "paddw           %%mm1, %%mm4       \n\t"\
368
    "punpckhwd       %%mm0, %%mm0       \n\t"\
369
    "punpckhwd       %%mm6, %%mm6       \n\t"\
370
    "punpckhwd       %%mm3, %%mm3       \n\t"\
371
    "paddw           %%mm7, %%mm0       \n\t"\
372
    "paddw           %%mm7, %%mm6       \n\t"\
373
    "paddw           %%mm7, %%mm3       \n\t"\
374
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
375
    "packuswb        %%mm0, %%mm2       \n\t"\
376
    "packuswb        %%mm6, %%mm5       \n\t"\
377
    "packuswb        %%mm3, %%mm4       \n\t"\
378

    
379
#define REAL_YSCALEYUV2PACKED(index, c) \
380
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
381
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
382
    "psraw                $3, %%mm0                           \n\t"\
383
    "psraw                $3, %%mm1                           \n\t"\
384
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
385
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
386
    "xor            "#index", "#index"                        \n\t"\
387
    ASMALIGN(4)\
388
    "1:                                 \n\t"\
389
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
390
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
391
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
392
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
393
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
394
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
395
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
396
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
397
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
398
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
399
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
400
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
401
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
402
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
403
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
404
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
405
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
406
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
407
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
408
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
409
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
410
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
411
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
412
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
413
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
414

    
415
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
416

    
417
#define REAL_YSCALEYUV2RGB_UV(index, c) \
418
    "xor            "#index", "#index"  \n\t"\
419
    ASMALIGN(4)\
420
    "1:                                 \n\t"\
421
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
422
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
423
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
424
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
425
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
426
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
427
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
428
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
429
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
430
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
431
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
432
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
433
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
434
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
435
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
436
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
437
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
438
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
439
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
440
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
441

    
442
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
443
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
444
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
445
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
446
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
447
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
448
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
449
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
450
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
451
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
452
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
453
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
454
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
455

    
456
#define REAL_YSCALEYUV2RGB_COEFF(c) \
457
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
458
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
459
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
460
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
461
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
462
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
463
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
464
    "paddw             %%mm3, %%mm4     \n\t"\
465
    "movq              %%mm2, %%mm0     \n\t"\
466
    "movq              %%mm5, %%mm6     \n\t"\
467
    "movq              %%mm4, %%mm3     \n\t"\
468
    "punpcklwd         %%mm2, %%mm2     \n\t"\
469
    "punpcklwd         %%mm5, %%mm5     \n\t"\
470
    "punpcklwd         %%mm4, %%mm4     \n\t"\
471
    "paddw             %%mm1, %%mm2     \n\t"\
472
    "paddw             %%mm1, %%mm5     \n\t"\
473
    "paddw             %%mm1, %%mm4     \n\t"\
474
    "punpckhwd         %%mm0, %%mm0     \n\t"\
475
    "punpckhwd         %%mm6, %%mm6     \n\t"\
476
    "punpckhwd         %%mm3, %%mm3     \n\t"\
477
    "paddw             %%mm7, %%mm0     \n\t"\
478
    "paddw             %%mm7, %%mm6     \n\t"\
479
    "paddw             %%mm7, %%mm3     \n\t"\
480
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
481
    "packuswb          %%mm0, %%mm2     \n\t"\
482
    "packuswb          %%mm6, %%mm5     \n\t"\
483
    "packuswb          %%mm3, %%mm4     \n\t"\
484

    
485
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
486

    
487
#define YSCALEYUV2RGB(index, c) \
488
    REAL_YSCALEYUV2RGB_UV(index, c) \
489
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
490
    REAL_YSCALEYUV2RGB_COEFF(c)
491

    
492
#define REAL_YSCALEYUV2PACKED1(index, c) \
493
    "xor            "#index", "#index"  \n\t"\
494
    ASMALIGN(4)\
495
    "1:                                 \n\t"\
496
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
497
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
498
    "psraw                $7, %%mm3     \n\t" \
499
    "psraw                $7, %%mm4     \n\t" \
500
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
501
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
502
    "psraw                $7, %%mm1     \n\t" \
503
    "psraw                $7, %%mm7     \n\t" \
504

    
505
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
506

    
507
#define REAL_YSCALEYUV2RGB1(index, c) \
508
    "xor            "#index", "#index"  \n\t"\
509
    ASMALIGN(4)\
510
    "1:                                 \n\t"\
511
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
512
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
513
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
514
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
515
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
516
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
517
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
518
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
519
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
520
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
521
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
522
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
523
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
524
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
525
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
526
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
527
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
528
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
529
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
530
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
531
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
532
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
533
    "paddw             %%mm3, %%mm4     \n\t"\
534
    "movq              %%mm2, %%mm0     \n\t"\
535
    "movq              %%mm5, %%mm6     \n\t"\
536
    "movq              %%mm4, %%mm3     \n\t"\
537
    "punpcklwd         %%mm2, %%mm2     \n\t"\
538
    "punpcklwd         %%mm5, %%mm5     \n\t"\
539
    "punpcklwd         %%mm4, %%mm4     \n\t"\
540
    "paddw             %%mm1, %%mm2     \n\t"\
541
    "paddw             %%mm1, %%mm5     \n\t"\
542
    "paddw             %%mm1, %%mm4     \n\t"\
543
    "punpckhwd         %%mm0, %%mm0     \n\t"\
544
    "punpckhwd         %%mm6, %%mm6     \n\t"\
545
    "punpckhwd         %%mm3, %%mm3     \n\t"\
546
    "paddw             %%mm7, %%mm0     \n\t"\
547
    "paddw             %%mm7, %%mm6     \n\t"\
548
    "paddw             %%mm7, %%mm3     \n\t"\
549
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
550
    "packuswb          %%mm0, %%mm2     \n\t"\
551
    "packuswb          %%mm6, %%mm5     \n\t"\
552
    "packuswb          %%mm3, %%mm4     \n\t"\
553

    
554
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
555

    
556
/* Unscaled packed output, averaging two chroma lines (no luma interpolation). */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\

#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)

/* Load and scale one alpha line into mm7 (packed to bytes). */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

/* Interleave b/g/r/a byte registers into 4 ARGB quadwords and store them. */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

/* Pack mm2/mm4/mm5 (B/G/R bytes) into RGB565 words and store 8 pixels. */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)

/* Pack mm2/mm4/mm5 (B/G/R bytes) into RGB555 words and store 8 pixels. */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)

/* Legacy 24-bit packing variant (mask/shift based), kept for reference. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"

/* Pack B/G/R byte registers into 24 bytes of packed RGB24 (plain MMX). */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"

/* Pack B/G/R byte registers into packed RGB24 using MMX2 pshufw shuffles. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"

/* Select the RGB24 writer matching the instruction set this template targets. */
#if COMPILE_TEMPLATE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

/* Interleave Y (mm1/mm7) with U (mm3) and V (mm4) into YUYV and store 8 pixels. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)


/**
 * Vertically scale multiple source lines into one planar YV12 output line,
 * using the MMX path unless bit-exact output was requested.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (uDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        } else {
            if (uDest) {
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest) {
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}

/** Vertically scale multiple source lines into one NV12/NV21 output line (C fallback only). */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, enum PixelFormat dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}

/**
 * Write one unscaled (1:1 vertical) planar YV12 line: each 16-bit source
 * sample is rounded, shifted down by 7 and clipped to 8 bits.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)) {
        long p= 4;
        /* const int16_t* matches the initializers (alpSrc/lumSrc/chrSrc are
         * const int16_t*); the entries are only passed as asm input operands. */
        const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND) {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        } else {
            while(p--) {
                if (dst[p]) {
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++) {
        int val= (lumSrc[i]+64)>>7;

        if (val&256) {
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++) {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256) {
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++) {
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}


/**
1023
 * vertical scale YV12 to RGB
1024
 */
1025
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1026
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1027
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1028
{
1029
#if COMPILE_TEMPLATE_MMX
1030
    x86_reg dummy=0;
1031
    if(!(c->flags & SWS_BITEXACT)) {
1032
        if (c->flags & SWS_ACCURATE_RND) {
1033
            switch(c->dstFormat) {
1034
            case PIX_FMT_RGB32:
1035
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1036
                    YSCALEYUV2PACKEDX_ACCURATE
1037
                    YSCALEYUV2RGBX
1038
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1039
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1040
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1041
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1042
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1043
                    "psraw                        $3, %%mm1         \n\t"
1044
                    "psraw                        $3, %%mm7         \n\t"
1045
                    "packuswb                  %%mm7, %%mm1         \n\t"
1046
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1047

    
1048
                    YSCALEYUV2PACKEDX_END
1049
                } else {
1050
                    YSCALEYUV2PACKEDX_ACCURATE
1051
                    YSCALEYUV2RGBX
1052
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1053
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1054

    
1055
                    YSCALEYUV2PACKEDX_END
1056
                }
1057
                return;
1058
            case PIX_FMT_BGR24:
1059
                YSCALEYUV2PACKEDX_ACCURATE
1060
                YSCALEYUV2RGBX
1061
                "pxor %%mm7, %%mm7 \n\t"
1062
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1063
                "add %4, %%"REG_c"                        \n\t"
1064
                WRITEBGR24(%%REGc, %5, %%REGa)
1065

    
1066

    
1067
                :: "r" (&c->redDither),
1068
                "m" (dummy), "m" (dummy), "m" (dummy),
1069
                "r" (dest), "m" (dstW)
1070
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1071
                );
1072
                return;
1073
            case PIX_FMT_RGB555:
1074
                YSCALEYUV2PACKEDX_ACCURATE
1075
                YSCALEYUV2RGBX
1076
                "pxor %%mm7, %%mm7 \n\t"
1077
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1078
#ifdef DITHER1XBPP
1079
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1080
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1081
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1082
#endif
1083

    
1084
                WRITERGB15(%4, %5, %%REGa)
1085
                YSCALEYUV2PACKEDX_END
1086
                return;
1087
            case PIX_FMT_RGB565:
1088
                YSCALEYUV2PACKEDX_ACCURATE
1089
                YSCALEYUV2RGBX
1090
                "pxor %%mm7, %%mm7 \n\t"
1091
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1092
#ifdef DITHER1XBPP
1093
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1094
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1095
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1096
#endif
1097

    
1098
                WRITERGB16(%4, %5, %%REGa)
1099
                YSCALEYUV2PACKEDX_END
1100
                return;
1101
            case PIX_FMT_YUYV422:
1102
                YSCALEYUV2PACKEDX_ACCURATE
1103
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1104

    
1105
                "psraw $3, %%mm3    \n\t"
1106
                "psraw $3, %%mm4    \n\t"
1107
                "psraw $3, %%mm1    \n\t"
1108
                "psraw $3, %%mm7    \n\t"
1109
                WRITEYUY2(%4, %5, %%REGa)
1110
                YSCALEYUV2PACKEDX_END
1111
                return;
1112
            }
1113
        } else {
1114
            switch(c->dstFormat) {
1115
            case PIX_FMT_RGB32:
1116
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1117
                    YSCALEYUV2PACKEDX
1118
                    YSCALEYUV2RGBX
1119
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1120
                    "psraw                        $3, %%mm1         \n\t"
1121
                    "psraw                        $3, %%mm7         \n\t"
1122
                    "packuswb                  %%mm7, %%mm1         \n\t"
1123
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1124
                    YSCALEYUV2PACKEDX_END
1125
                } else {
1126
                    YSCALEYUV2PACKEDX
1127
                    YSCALEYUV2RGBX
1128
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1129
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1130
                    YSCALEYUV2PACKEDX_END
1131
                }
1132
                return;
1133
            case PIX_FMT_BGR24:
1134
                YSCALEYUV2PACKEDX
1135
                YSCALEYUV2RGBX
1136
                "pxor                    %%mm7, %%mm7       \n\t"
1137
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1138
                "add                        %4, %%"REG_c"   \n\t"
1139
                WRITEBGR24(%%REGc, %5, %%REGa)
1140

    
1141
                :: "r" (&c->redDither),
1142
                "m" (dummy), "m" (dummy), "m" (dummy),
1143
                "r" (dest),  "m" (dstW)
1144
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1145
                );
1146
                return;
1147
            case PIX_FMT_RGB555:
1148
                YSCALEYUV2PACKEDX
1149
                YSCALEYUV2RGBX
1150
                "pxor %%mm7, %%mm7 \n\t"
1151
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1152
#ifdef DITHER1XBPP
1153
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1154
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1155
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1156
#endif
1157

    
1158
                WRITERGB15(%4, %5, %%REGa)
1159
                YSCALEYUV2PACKEDX_END
1160
                return;
1161
            case PIX_FMT_RGB565:
1162
                YSCALEYUV2PACKEDX
1163
                YSCALEYUV2RGBX
1164
                "pxor %%mm7, %%mm7 \n\t"
1165
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1166
#ifdef DITHER1XBPP
1167
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1168
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1169
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1170
#endif
1171

    
1172
                WRITERGB16(%4, %5, %%REGa)
1173
                YSCALEYUV2PACKEDX_END
1174
                return;
1175
            case PIX_FMT_YUYV422:
1176
                YSCALEYUV2PACKEDX
1177
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1178

    
1179
                "psraw $3, %%mm3    \n\t"
1180
                "psraw $3, %%mm4    \n\t"
1181
                "psraw $3, %%mm1    \n\t"
1182
                "psraw $3, %%mm7    \n\t"
1183
                WRITEYUY2(%4, %5, %%REGa)
1184
                YSCALEYUV2PACKEDX_END
1185
                return;
1186
            }
1187
        }
1188
    }
1189
#endif /* COMPILE_TEMPLATE_MMX */
1190
#if COMPILE_TEMPLATE_ALTIVEC
1191
    /* The following list of supported dstFormat values should
1192
       match what's found in the body of ff_yuv2packedX_altivec() */
1193
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1194
         (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1195
          c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1196
          c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1197
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1198
                                   chrFilter, chrSrc, chrFilterSize,
1199
                                   dest, dstW, dstY);
1200
    else
1201
#endif
1202
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1203
                       chrFilter, chrSrc, chrFilterSize,
1204
                       alpSrc, dest, dstW, dstY);
1205
}
1206

    
1207
/**
1208
 * vertical bilinear scale YV12 to RGB
1209
 */
1210
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1211
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1212
{
1213
    int  yalpha1=4095- yalpha;
1214
    int uvalpha1=4095-uvalpha;
1215
    int i;
1216

    
1217
#if COMPILE_TEMPLATE_MMX
1218
    if(!(c->flags & SWS_BITEXACT)) {
1219
        switch(c->dstFormat) {
1220
        //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1221
        case PIX_FMT_RGB32:
1222
            if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1223
#if ARCH_X86_64
1224
                __asm__ volatile(
1225
                    YSCALEYUV2RGB(%%r8, %5)
1226
                    YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
1227
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1228
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1229
                    "packuswb            %%mm7, %%mm1       \n\t"
1230
                    WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1231

    
1232
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1233
                    "a" (&c->redDither)
1234
                    ,"r" (abuf0), "r" (abuf1)
1235
                    : "%r8"
1236
                );
1237
#else
1238
                *(uint16_t **)(&c->u_temp)=abuf0;
1239
                *(uint16_t **)(&c->v_temp)=abuf1;
1240
                __asm__ volatile(
1241
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1242
                    "mov        %4, %%"REG_b"               \n\t"
1243
                    "push %%"REG_BP"                        \n\t"
1244
                    YSCALEYUV2RGB(%%REGBP, %5)
1245
                    "push                   %0              \n\t"
1246
                    "push                   %1              \n\t"
1247
                    "mov          "U_TEMP"(%5), %0          \n\t"
1248
                    "mov          "V_TEMP"(%5), %1          \n\t"
1249
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1250
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1251
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1252
                    "packuswb            %%mm7, %%mm1       \n\t"
1253
                    "pop                    %1              \n\t"
1254
                    "pop                    %0              \n\t"
1255
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1256
                    "pop %%"REG_BP"                         \n\t"
1257
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1258

    
1259
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1260
                    "a" (&c->redDither)
1261
                );
1262
#endif
1263
            } else {
1264
                __asm__ volatile(
1265
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1266
                    "mov        %4, %%"REG_b"               \n\t"
1267
                    "push %%"REG_BP"                        \n\t"
1268
                    YSCALEYUV2RGB(%%REGBP, %5)
1269
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1270
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1271
                    "pop %%"REG_BP"                         \n\t"
1272
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1273

    
1274
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1275
                    "a" (&c->redDither)
1276
                );
1277
            }
1278
            return;
1279
        case PIX_FMT_BGR24:
1280
            __asm__ volatile(
1281
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1282
                "mov        %4, %%"REG_b"               \n\t"
1283
                "push %%"REG_BP"                        \n\t"
1284
                YSCALEYUV2RGB(%%REGBP, %5)
1285
                "pxor    %%mm7, %%mm7                   \n\t"
1286
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1287
                "pop %%"REG_BP"                         \n\t"
1288
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1289
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1290
                "a" (&c->redDither)
1291
            );
1292
            return;
1293
        case PIX_FMT_RGB555:
1294
            __asm__ volatile(
1295
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1296
                "mov        %4, %%"REG_b"               \n\t"
1297
                "push %%"REG_BP"                        \n\t"
1298
                YSCALEYUV2RGB(%%REGBP, %5)
1299
                "pxor    %%mm7, %%mm7                   \n\t"
1300
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1301
#ifdef DITHER1XBPP
1302
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1303
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1304
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1305
#endif
1306

    
1307
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1308
                "pop %%"REG_BP"                         \n\t"
1309
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1310

    
1311
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1312
                "a" (&c->redDither)
1313
            );
1314
            return;
1315
        case PIX_FMT_RGB565:
1316
            __asm__ volatile(
1317
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1318
                "mov        %4, %%"REG_b"               \n\t"
1319
                "push %%"REG_BP"                        \n\t"
1320
                YSCALEYUV2RGB(%%REGBP, %5)
1321
                "pxor    %%mm7, %%mm7                   \n\t"
1322
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1323
#ifdef DITHER1XBPP
1324
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1325
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1326
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1327
#endif
1328

    
1329
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1330
                "pop %%"REG_BP"                         \n\t"
1331
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1332
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1333
                "a" (&c->redDither)
1334
            );
1335
            return;
1336
        case PIX_FMT_YUYV422:
1337
            __asm__ volatile(
1338
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1339
                "mov %4, %%"REG_b"                        \n\t"
1340
                "push %%"REG_BP"                        \n\t"
1341
                YSCALEYUV2PACKED(%%REGBP, %5)
1342
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1343
                "pop %%"REG_BP"                         \n\t"
1344
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1345
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1346
                "a" (&c->redDither)
1347
            );
1348
            return;
1349
        default: break;
1350
        }
1351
    }
1352
#endif //COMPILE_TEMPLATE_MMX
1353
    YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1354
}
/**
1357
 * YV12 to RGB without scaling or interpolating
1358
 */
1359
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1360
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, enum PixelFormat dstFormat, int flags, int y)
1361
{
1362
    const int yalpha1=0;
1363
    int i;
1364

    
1365
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1366
    const int yalpha= 4096; //FIXME ...
1367

    
1368
    if (flags&SWS_FULL_CHR_H_INT) {
1369
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1370
        return;
1371
    }
1372

    
1373
#if COMPILE_TEMPLATE_MMX
1374
    if(!(flags & SWS_BITEXACT)) {
1375
        if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1376
            switch(dstFormat) {
1377
            case PIX_FMT_RGB32:
1378
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1379
                    __asm__ volatile(
1380
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1381
                        "mov        %4, %%"REG_b"               \n\t"
1382
                        "push %%"REG_BP"                        \n\t"
1383
                        YSCALEYUV2RGB1(%%REGBP, %5)
1384
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1385
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1386
                        "pop %%"REG_BP"                         \n\t"
1387
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1388

    
1389
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1390
                        "a" (&c->redDither)
1391
                    );
1392
                } else {
1393
                    __asm__ volatile(
1394
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1395
                        "mov        %4, %%"REG_b"               \n\t"
1396
                        "push %%"REG_BP"                        \n\t"
1397
                        YSCALEYUV2RGB1(%%REGBP, %5)
1398
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1399
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1400
                        "pop %%"REG_BP"                         \n\t"
1401
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1402

    
1403
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1404
                        "a" (&c->redDither)
1405
                    );
1406
                }
1407
                return;
1408
            case PIX_FMT_BGR24:
1409
                __asm__ volatile(
1410
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1411
                    "mov        %4, %%"REG_b"               \n\t"
1412
                    "push %%"REG_BP"                        \n\t"
1413
                    YSCALEYUV2RGB1(%%REGBP, %5)
1414
                    "pxor    %%mm7, %%mm7                   \n\t"
1415
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1416
                    "pop %%"REG_BP"                         \n\t"
1417
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1418

    
1419
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1420
                    "a" (&c->redDither)
1421
                );
1422
                return;
1423
            case PIX_FMT_RGB555:
1424
                __asm__ volatile(
1425
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1426
                    "mov        %4, %%"REG_b"               \n\t"
1427
                    "push %%"REG_BP"                        \n\t"
1428
                    YSCALEYUV2RGB1(%%REGBP, %5)
1429
                    "pxor    %%mm7, %%mm7                   \n\t"
1430
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1431
#ifdef DITHER1XBPP
1432
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1433
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1434
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1435
#endif
1436
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1437
                    "pop %%"REG_BP"                         \n\t"
1438
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1439

    
1440
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1441
                    "a" (&c->redDither)
1442
                );
1443
                return;
1444
            case PIX_FMT_RGB565:
1445
                __asm__ volatile(
1446
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1447
                    "mov        %4, %%"REG_b"               \n\t"
1448
                    "push %%"REG_BP"                        \n\t"
1449
                    YSCALEYUV2RGB1(%%REGBP, %5)
1450
                    "pxor    %%mm7, %%mm7                   \n\t"
1451
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1452
#ifdef DITHER1XBPP
1453
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1454
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1455
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1456
#endif
1457

    
1458
                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1459
                    "pop %%"REG_BP"                         \n\t"
1460
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1461

    
1462
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1463
                    "a" (&c->redDither)
1464
                );
1465
                return;
1466
            case PIX_FMT_YUYV422:
1467
                __asm__ volatile(
1468
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1469
                    "mov        %4, %%"REG_b"               \n\t"
1470
                    "push %%"REG_BP"                        \n\t"
1471
                    YSCALEYUV2PACKED1(%%REGBP, %5)
1472
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1473
                    "pop %%"REG_BP"                         \n\t"
1474
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1475

    
1476
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1477
                    "a" (&c->redDither)
1478
                );
1479
                return;
1480
            }
1481
        } else {
1482
            switch(dstFormat) {
1483
            case PIX_FMT_RGB32:
1484
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1485
                    __asm__ volatile(
1486
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1487
                        "mov        %4, %%"REG_b"               \n\t"
1488
                        "push %%"REG_BP"                        \n\t"
1489
                        YSCALEYUV2RGB1b(%%REGBP, %5)
1490
                        YSCALEYUV2RGB1_ALPHA(%%REGBP)
1491
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1492
                        "pop %%"REG_BP"                         \n\t"
1493
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1494

    
1495
                        :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1496
                        "a" (&c->redDither)
1497
                    );
1498
                } else {
1499
                    __asm__ volatile(
1500
                        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1501
                        "mov        %4, %%"REG_b"               \n\t"
1502
                        "push %%"REG_BP"                        \n\t"
1503
                        YSCALEYUV2RGB1b(%%REGBP, %5)
1504
                        "pcmpeqd %%mm7, %%mm7                   \n\t"
1505
                        WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1506
                        "pop %%"REG_BP"                         \n\t"
1507
                        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1508

    
1509
                        :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1510
                        "a" (&c->redDither)
1511
                    );
1512
                }
1513
                return;
1514
            case PIX_FMT_BGR24:
1515
                __asm__ volatile(
1516
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1517
                    "mov        %4, %%"REG_b"               \n\t"
1518
                    "push %%"REG_BP"                        \n\t"
1519
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1520
                    "pxor    %%mm7, %%mm7                   \n\t"
1521
                    WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1522
                    "pop %%"REG_BP"                         \n\t"
1523
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1524

    
1525
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1526
                    "a" (&c->redDither)
1527
                );
1528
                return;
1529
            case PIX_FMT_RGB555:
1530
                __asm__ volatile(
1531
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1532
                    "mov        %4, %%"REG_b"               \n\t"
1533
                    "push %%"REG_BP"                        \n\t"
1534
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1535
                    "pxor    %%mm7, %%mm7                   \n\t"
1536
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1537
#ifdef DITHER1XBPP
1538
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1539
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1540
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1541
#endif
1542
                    WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1543
                    "pop %%"REG_BP"                         \n\t"
1544
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1545

    
1546
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1547
                    "a" (&c->redDither)
1548
                );
1549
                return;
1550
            case PIX_FMT_RGB565:
1551
                __asm__ volatile(
1552
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1553
                    "mov        %4, %%"REG_b"               \n\t"
1554
                    "push %%"REG_BP"                        \n\t"
1555
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1556
                    "pxor    %%mm7, %%mm7                   \n\t"
1557
                    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1558
#ifdef DITHER1XBPP
1559
                    "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1560
                    "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1561
                    "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1562
#endif
1563

    
1564
                    WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1565
                    "pop %%"REG_BP"                         \n\t"
1566
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1567

    
1568
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1569
                    "a" (&c->redDither)
1570
                );
1571
                return;
1572
            case PIX_FMT_YUYV422:
1573
                __asm__ volatile(
1574
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1575
                    "mov        %4, %%"REG_b"               \n\t"
1576
                    "push %%"REG_BP"                        \n\t"
1577
                    YSCALEYUV2PACKED1b(%%REGBP, %5)
1578
                    WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1579
                    "pop %%"REG_BP"                         \n\t"
1580
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1581

    
1582
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1583
                    "a" (&c->redDither)
1584
                );
1585
                return;
1586
            }
1587
        }
1588
    }
1589
#endif /* COMPILE_TEMPLATE_MMX */
1590
    if (uvalpha < 2048) {
1591
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1592
    } else {
1593
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1594
    }
1595
}

//FIXME yuy2* can read up to 7 samples too much

/**
 * Extract the luma plane from packed YUYV422: dst[i] = src[2*i].
 * The MMX path masks the even bytes (bm01010101) of 16 input bytes and
 * packs them into 8 luma samples per iteration.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm2           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "pand                %%mm2, %%mm0           \n\t"
        "pand                %%mm2, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
/**
 * Extract the U and V planes from packed YUYV422:
 * dstU[i] = src1[4*i+1], dstV[i] = src1[4*i+3].
 * src2 must equal src1 (asserted); the parameter exists only to match the
 * common chroma-input function signature.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "psrlw                  $8, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
/**
 * Take the high byte of each little-endian 16-bit sample from two planes:
 * dstU[i] = src1[2*i+1], dstV[i] = src2[2*i+1].
 */
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "psrlw                  $8, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
#endif
}
/* This is almost identical to the previous, end exists only because
1691
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1692
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1693
{
1694
#if COMPILE_TEMPLATE_MMX
1695
    __asm__ volatile(
1696
        "mov                  %0, %%"REG_a"         \n\t"
1697
        "1:                                         \n\t"
1698
        "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1699
        "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1700
        "psrlw                $8, %%mm0             \n\t"
1701
        "psrlw                $8, %%mm1             \n\t"
1702
        "packuswb          %%mm1, %%mm0             \n\t"
1703
        "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1704
        "add                  $8, %%"REG_a"         \n\t"
1705
        " js                  1b                    \n\t"
1706
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
1707
        : "%"REG_a
1708
    );
1709
#else
1710
    int i;
1711
    for (i=0; i<width; i++)
1712
        dst[i]= src[2*i+1];
1713
#endif
1714
}
/**
 * Extract the U and V planes from packed UYVY422:
 * dstU[i] = src1[4*i+0], dstV[i] = src1[4*i+2].
 * src2 must equal src1 (asserted); the parameter exists only to match the
 * common chroma-input function signature.
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "movq                %%mm0, %%mm1           \n\t"
        "psrlw                  $8, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "packuswb            %%mm0, %%mm0           \n\t"
        "packuswb            %%mm1, %%mm1           \n\t"
        "movd                %%mm0, (%3, %%"REG_a") \n\t"
        "movd                %%mm1, (%2, %%"REG_a") \n\t"
        "add                    $4, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
/**
 * Take the low-address byte of each big-endian 16-bit sample from two planes:
 * dstU[i] = src1[2*i], dstV[i] = src2[2*i].
 */
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
        "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "pand                %%mm4, %%mm2           \n\t"
        "pand                %%mm4, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%3, %%"REG_a") \n\t"
        "movq                %%mm2, (%4, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++) {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
#endif
}
/**
 * De-interleave a byte-interleaved plane into two planes:
 * dst1[i] = src[2*i], dst2[i] = src[2*i+1].
 * Shared worker for the NV12/NV21 chroma readers below.
 */
static inline void RENAME(nvXXtoUV)(uint8_t *dst1, uint8_t *dst2,
                                    const uint8_t *src, long width)
{
#if COMPILE_TEMPLATE_MMX
    __asm__ volatile(
        "movq "MANGLE(bm01010101)", %%mm4           \n\t"
        "mov                    %0, %%"REG_a"       \n\t"
        "1:                                         \n\t"
        "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
        "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
        "movq                %%mm0, %%mm2           \n\t"
        "movq                %%mm1, %%mm3           \n\t"
        "pand                %%mm4, %%mm0           \n\t"
        "pand                %%mm4, %%mm1           \n\t"
        "psrlw                  $8, %%mm2           \n\t"
        "psrlw                  $8, %%mm3           \n\t"
        "packuswb            %%mm1, %%mm0           \n\t"
        "packuswb            %%mm3, %%mm2           \n\t"
        "movq                %%mm0, (%2, %%"REG_a") \n\t"
        "movq                %%mm2, (%3, %%"REG_a") \n\t"
        "add                    $8, %%"REG_a"       \n\t"
        " js                    1b                  \n\t"
        : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst1+width), "r" (dst2+width)
        : "%"REG_a
    );
#else
    int i;
    for (i = 0; i < width; i++) {
        dst1[i] = src[2*i+0];
        dst2[i] = src[2*i+1];
    }
#endif
}
/**
 * NV12 chroma reader: interleaved UV in src1 (U first); src2 is unused,
 * present only to match the common chroma-input signature.
 */
static inline void RENAME(nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstU, dstV, src1, width);
}
/**
 * NV21 chroma reader: interleaved VU in src1 (V first), so the destination
 * planes are swapped relative to nv12ToUV; src2 is unused.
 */
static inline void RENAME(nv21ToUV)(uint8_t *dstU, uint8_t *dstV,
                                    const uint8_t *src1, const uint8_t *src2,
                                    long width, uint32_t *unused)
{
    RENAME(nvXXtoUV)(dstV, dstU, src1, width);
}

#if COMPILE_TEMPLATE_MMX
/**
 * Convert packed 24-bit RGB/BGR to luma with MMX.
 * Loads the BGR (or RGB, chosen by srcFormat) coefficient pairs into
 * mm5/mm6, then per iteration converts 4 pixels (12 bytes): widen bytes to
 * words, pmaddwd against the coefficients, add the rounding offset, shift
 * right by 15 and pack down to 4 luma bytes.
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, enum PixelFormat srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24) {
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    } else {
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "pmaddwd                 %%mm5, %%mm0       \n\t"
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm4, %%mm0       \n\t"
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    : "%"REG_a
    );
}
/* MMX: convert one line of packed 24-bit BGR/RGB into separate U and V planes.
 * %4 is ff_bgr24toUV[srcFormat==PIX_FMT_RGB24][0], a table of pmaddwd
 * coefficient pairs (offsets 0/8/16/24 select the per-channel halves; the
 * 24-byte entry is preloaded into mm6). Each loop iteration consumes 4 pixels
 * (12 bytes, "add $12,%0") and stores 4 U and 4 V bytes, with REG_a counting
 * up from -width to 0. */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, enum PixelFormat srcFormat)
{
    __asm__ volatile(
        "movq                    24+%4, %%mm6       \n\t"
        "mov                        %3, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        /* pixels 0-1: load, widen bytes to words, multiply-accumulate */
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                    %4, %%mm0       \n\t"
        "pmaddwd                  8+%4, %%mm1       \n\t"
        "pmaddwd                 16+%4, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        /* pixels 2-3: same computation into mm1 (U) and mm4 (V) */
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                    %4, %%mm1       \n\t"
        "pmaddwd                  8+%4, %%mm3       \n\t"
        "pmaddwd                 16+%4, %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        /* add rounding/bias offset, scale down by 2^15, pack to bytes */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    : "%"REG_a
    );
}
1942
#endif
1943

    
1944
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1945
{
1946
#if COMPILE_TEMPLATE_MMX
1947
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1948
#else
1949
    int i;
1950
    for (i=0; i<width; i++) {
1951
        int b= src[i*3+0];
1952
        int g= src[i*3+1];
1953
        int r= src[i*3+2];
1954

    
1955
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1956
    }
1957
#endif /* COMPILE_TEMPLATE_MMX */
1958
}
1959

    
1960
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1961
{
1962
#if COMPILE_TEMPLATE_MMX
1963
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1964
#else
1965
    int i;
1966
    for (i=0; i<width; i++) {
1967
        int b= src1[3*i + 0];
1968
        int g= src1[3*i + 1];
1969
        int r= src1[3*i + 2];
1970

    
1971
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1972
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1973
    }
1974
#endif /* COMPILE_TEMPLATE_MMX */
1975
    assert(src1 == src2);
1976
}
1977

    
1978
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1979
{
1980
    int i;
1981
    for (i=0; i<width; i++) {
1982
        int b= src1[6*i + 0] + src1[6*i + 3];
1983
        int g= src1[6*i + 1] + src1[6*i + 4];
1984
        int r= src1[6*i + 2] + src1[6*i + 5];
1985

    
1986
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1987
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1988
    }
1989
    assert(src1 == src2);
1990
}
1991

    
1992
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1993
{
1994
#if COMPILE_TEMPLATE_MMX
1995
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1996
#else
1997
    int i;
1998
    for (i=0; i<width; i++) {
1999
        int r= src[i*3+0];
2000
        int g= src[i*3+1];
2001
        int b= src[i*3+2];
2002

    
2003
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2004
    }
2005
#endif
2006
}
2007

    
2008
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2009
{
2010
#if COMPILE_TEMPLATE_MMX
2011
    assert(src1==src2);
2012
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
2013
#else
2014
    int i;
2015
    assert(src1==src2);
2016
    for (i=0; i<width; i++) {
2017
        int r= src1[3*i + 0];
2018
        int g= src1[3*i + 1];
2019
        int b= src1[3*i + 2];
2020

    
2021
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2022
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
2023
    }
2024
#endif
2025
}
2026

    
2027
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2028
{
2029
    int i;
2030
    assert(src1==src2);
2031
    for (i=0; i<width; i++) {
2032
        int r= src1[6*i + 0] + src1[6*i + 3];
2033
        int g= src1[6*i + 1] + src1[6*i + 4];
2034
        int b= src1[6*i + 2] + src1[6*i + 5];
2035

    
2036
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2037
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2038
    }
2039
}
2040

    
2041

    
2042
// bilinear / bicubic scaling
2043
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2044
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
2045
{
2046
#if COMPILE_TEMPLATE_MMX
2047
    assert(filterSize % 4 == 0 && filterSize>0);
2048
    if (filterSize==4) { // Always true for upscaling, sometimes for down, too.
2049
        x86_reg counter= -2*dstW;
2050
        filter-= counter*2;
2051
        filterPos-= counter/2;
2052
        dst-= counter/2;
2053
        __asm__ volatile(
2054
#if defined(PIC)
2055
            "push            %%"REG_b"              \n\t"
2056
#endif
2057
            "pxor                %%mm7, %%mm7       \n\t"
2058
            "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
2059
            "mov             %%"REG_a", %%"REG_BP"  \n\t"
2060
            ASMALIGN(4)
2061
            "1:                                     \n\t"
2062
            "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
2063
            "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
2064
            "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
2065
            "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
2066
            "movd      (%3, %%"REG_a"), %%mm0       \n\t"
2067
            "movd      (%3, %%"REG_b"), %%mm2       \n\t"
2068
            "punpcklbw           %%mm7, %%mm0       \n\t"
2069
            "punpcklbw           %%mm7, %%mm2       \n\t"
2070
            "pmaddwd             %%mm1, %%mm0       \n\t"
2071
            "pmaddwd             %%mm2, %%mm3       \n\t"
2072
            "movq                %%mm0, %%mm4       \n\t"
2073
            "punpckldq           %%mm3, %%mm0       \n\t"
2074
            "punpckhdq           %%mm3, %%mm4       \n\t"
2075
            "paddd               %%mm4, %%mm0       \n\t"
2076
            "psrad                  $7, %%mm0       \n\t"
2077
            "packssdw            %%mm0, %%mm0       \n\t"
2078
            "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
2079
            "add                    $4, %%"REG_BP"  \n\t"
2080
            " jnc                   1b              \n\t"
2081

    
2082
            "pop            %%"REG_BP"              \n\t"
2083
#if defined(PIC)
2084
            "pop             %%"REG_b"              \n\t"
2085
#endif
2086
            : "+a" (counter)
2087
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2088
#if !defined(PIC)
2089
            : "%"REG_b
2090
#endif
2091
        );
2092
    } else if (filterSize==8) {
2093
        x86_reg counter= -2*dstW;
2094
        filter-= counter*4;
2095
        filterPos-= counter/2;
2096
        dst-= counter/2;
2097
        __asm__ volatile(
2098
#if defined(PIC)
2099
            "push             %%"REG_b"             \n\t"
2100
#endif
2101
            "pxor                 %%mm7, %%mm7      \n\t"
2102
            "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
2103
            "mov              %%"REG_a", %%"REG_BP" \n\t"
2104
            ASMALIGN(4)
2105
            "1:                                     \n\t"
2106
            "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
2107
            "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
2108
            "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
2109
            "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2110
            "movd       (%3, %%"REG_a"), %%mm0      \n\t"
2111
            "movd       (%3, %%"REG_b"), %%mm2      \n\t"
2112
            "punpcklbw            %%mm7, %%mm0      \n\t"
2113
            "punpcklbw            %%mm7, %%mm2      \n\t"
2114
            "pmaddwd              %%mm1, %%mm0      \n\t"
2115
            "pmaddwd              %%mm2, %%mm3      \n\t"
2116

    
2117
            "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
2118
            "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2119
            "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
2120
            "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
2121
            "punpcklbw            %%mm7, %%mm4      \n\t"
2122
            "punpcklbw            %%mm7, %%mm2      \n\t"
2123
            "pmaddwd              %%mm1, %%mm4      \n\t"
2124
            "pmaddwd              %%mm2, %%mm5      \n\t"
2125
            "paddd                %%mm4, %%mm0      \n\t"
2126
            "paddd                %%mm5, %%mm3      \n\t"
2127
            "movq                 %%mm0, %%mm4      \n\t"
2128
            "punpckldq            %%mm3, %%mm0      \n\t"
2129
            "punpckhdq            %%mm3, %%mm4      \n\t"
2130
            "paddd                %%mm4, %%mm0      \n\t"
2131
            "psrad                   $7, %%mm0      \n\t"
2132
            "packssdw             %%mm0, %%mm0      \n\t"
2133
            "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2134
            "add                     $4, %%"REG_BP" \n\t"
2135
            " jnc                    1b             \n\t"
2136

    
2137
            "pop             %%"REG_BP"             \n\t"
2138
#if defined(PIC)
2139
            "pop              %%"REG_b"             \n\t"
2140
#endif
2141
            : "+a" (counter)
2142
            : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2143
#if !defined(PIC)
2144
            : "%"REG_b
2145
#endif
2146
        );
2147
    } else {
2148
        uint8_t *offset = src+filterSize;
2149
        x86_reg counter= -2*dstW;
2150
        //filter-= counter*filterSize/2;
2151
        filterPos-= counter/2;
2152
        dst-= counter/2;
2153
        __asm__ volatile(
2154
            "pxor                  %%mm7, %%mm7     \n\t"
2155
            ASMALIGN(4)
2156
            "1:                                     \n\t"
2157
            "mov                      %2, %%"REG_c" \n\t"
2158
            "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2159
            "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2160
            "mov                      %5, %%"REG_c" \n\t"
2161
            "pxor                  %%mm4, %%mm4     \n\t"
2162
            "pxor                  %%mm5, %%mm5     \n\t"
2163
            "2:                                     \n\t"
2164
            "movq                   (%1), %%mm1     \n\t"
2165
            "movq               (%1, %6), %%mm3     \n\t"
2166
            "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2167
            "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2168
            "punpcklbw             %%mm7, %%mm0     \n\t"
2169
            "punpcklbw             %%mm7, %%mm2     \n\t"
2170
            "pmaddwd               %%mm1, %%mm0     \n\t"
2171
            "pmaddwd               %%mm2, %%mm3     \n\t"
2172
            "paddd                 %%mm3, %%mm5     \n\t"
2173
            "paddd                 %%mm0, %%mm4     \n\t"
2174
            "add                      $8, %1        \n\t"
2175
            "add                      $4, %%"REG_c" \n\t"
2176
            "cmp                      %4, %%"REG_c" \n\t"
2177
            " jb                      2b            \n\t"
2178
            "add                      %6, %1        \n\t"
2179
            "movq                  %%mm4, %%mm0     \n\t"
2180
            "punpckldq             %%mm5, %%mm4     \n\t"
2181
            "punpckhdq             %%mm5, %%mm0     \n\t"
2182
            "paddd                 %%mm0, %%mm4     \n\t"
2183
            "psrad                    $7, %%mm4     \n\t"
2184
            "packssdw              %%mm4, %%mm4     \n\t"
2185
            "mov                      %3, %%"REG_a" \n\t"
2186
            "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2187
            "add                      $4, %0        \n\t"
2188
            " jnc                     1b            \n\t"
2189

    
2190
            : "+r" (counter), "+r" (filter)
2191
            : "m" (filterPos), "m" (dst), "m"(offset),
2192
            "m" (src), "r" ((x86_reg)filterSize*2)
2193
            : "%"REG_a, "%"REG_c, "%"REG_d
2194
        );
2195
    }
2196
#else
2197
#if COMPILE_TEMPLATE_ALTIVEC
2198
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2199
#else
2200
    int i;
2201
    for (i=0; i<dstW; i++) {
2202
        int j;
2203
        int srcPos= filterPos[i];
2204
        int val=0;
2205
        //printf("filterPos: %d\n", filterPos[i]);
2206
        for (j=0; j<filterSize; j++) {
2207
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2208
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2209
        }
2210
        //filter += hFilterSize;
2211
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2212
        //dst[i] = val>>7;
2213
    }
2214
#endif /* COMPILE_ALTIVEC */
2215
#endif /* COMPILE_MMX */
2216
}
2217

    
2218
//FIXME all pal and rgb srcFormats could do this convertion as well
2219
//FIXME all scalers more complex than bilinear could do half of this transform
2220
static void RENAME(chrRangeToJpeg)(uint16_t *dst, int width)
2221
{
2222
    int i;
2223
    for (i = 0; i < width; i++) {
2224
        dst[i     ] = (FFMIN(dst[i     ],30775)*4663 - 9289992)>>12; //-264
2225
        dst[i+VOFW] = (FFMIN(dst[i+VOFW],30775)*4663 - 9289992)>>12; //-264
2226
    }
2227
}
2228
static void RENAME(chrRangeFromJpeg)(uint16_t *dst, int width)
2229
{
2230
    int i;
2231
    for (i = 0; i < width; i++) {
2232
        dst[i     ] = (dst[i     ]*1799 + 4081085)>>11; //1469
2233
        dst[i+VOFW] = (dst[i+VOFW]*1799 + 4081085)>>11; //1469
2234
    }
2235
}
2236
static void RENAME(lumRangeToJpeg)(uint16_t *dst, int width)
2237
{
2238
    int i;
2239
    for (i = 0; i < width; i++)
2240
        dst[i] = (FFMIN(dst[i],30189)*19077 - 39057361)>>14;
2241
}
2242
static void RENAME(lumRangeFromJpeg)(uint16_t *dst, int width)
2243
{
2244
    int i;
2245
    for (i = 0; i < width; i++)
2246
        dst[i] = (dst[i]*14071 + 33561947)>>14;
2247
}
2248

    
2249
/* Inner step of the x86 fast-bilinear scalers: on entry edi = src[xx] and
 * esi = src[xx+1] (zero-extended), ecx = xalpha; on exit esi holds the
 * 7-bit-fractional interpolated sample and REG_D is reloaded from operand %1. */
#define FAST_BILINEAR_X86 \
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
    "shll      $16, %%edi    \n\t"                                              \
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
    "mov        %1, %%"REG_D"\n\t"                                              \
    "shrl       $9, %%esi    \n\t"
2256

    
2257
/* Fast-bilinear horizontal scale of one luma line into 15-bit intermediates
 * (output scale: src value * 128, see the C fallback below).
 * Three paths: (1) run-time-generated MMX2 scaler code (c->lumMmx2FilterCode),
 * (2) generic x86 asm bilinear loop, (3) plain C bilinear. */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
    int32_t *mmx2FilterPos = c->lumMmx2FilterPos;
    int16_t *mmx2Filter    = c->lumMmx2Filter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->lumMmx2FilterCode;
    int i;
#if defined(PIC)
    /* ebx is the PIC register; it must be preserved across the asm block */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

/* invoke one chunk of the generated scaler, then advance src (REG_c) and
 * dst (REG_D) by the amounts recorded in the filter-position table */
#define CALL_MMX2_FILTER_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (mmx2FilterCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* fix up trailing samples whose source index reaches srcW-1 */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
    x86_reg xInc_shr16 = xInc >> 16;
    uint16_t xInc_mask = xInc & 0xffff;
    //NO MMX just normal asm ...
    /* two output samples per iteration; xx advances via add-with-carry on
     * the 16.16 fixed-point position */
    __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
    );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;        // 16.16 fixed-point source position
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;   // 7-bit blend factor
        dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2378

    
2379
      // *** horizontal scale Y line to temp buffer
2380
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
2381
                                   const int16_t *hLumFilter,
2382
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
2383
                                   uint8_t *formatConvBuffer,
2384
                                   uint32_t *pal, int isAlpha)
2385
{
2386
    void (*toYV12)(uint8_t *, const uint8_t *, long, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2387
    void (*convertRange)(uint16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2388

    
2389
    src += isAlpha ? c->alpSrcOffset : c->lumSrcOffset;
2390

    
2391
    if (toYV12) {
2392
        toYV12(formatConvBuffer, src, srcW, pal);
2393
        src= formatConvBuffer;
2394
    }
2395

    
2396
    if (!c->hyscale_fast)
2397
    {
2398
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
2399
    } else { // fast bilinear upscale / crap downscale
2400
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2401
    }
2402

    
2403
    if (convertRange)
2404
        convertRange(dst, dstWidth);
2405
}
2406

    
2407
/* Fast-bilinear horizontal scale of one chroma line pair: U samples from
 * src1 go to dst[0..dstWidth-1], V samples from src2 to dst[VOFW..].
 * Same three paths as hyscale_fast: generated MMX2 code, generic x86 asm,
 * or plain C. */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        long dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
    int32_t *mmx2FilterPos = c->chrMmx2FilterPos;
    int16_t *mmx2Filter    = c->chrMmx2Filter;
    int     canMMX2BeUsed  = c->canMMX2BeUsed;
    void    *mmx2FilterCode= c->chrMmx2FilterCode;
    int i;
#if defined(PIC)
    /* ebx is the PIC register; it must be preserved across the asm block */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
    if (canMMX2BeUsed) {
        __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            /* first plane (src1 -> dst); CALL_MMX2_FILTER_CODE is defined
             * in hyscale_fast above */
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            /* second plane (src2 -> dst+VOF) */
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE
            CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
        );
        /* fix up trailing samples whose source index reaches srcW-1 */
        for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
            dst[i] = src1[srcW-1]*128;
            dst[i+VOFW] = src2[srcW-1]*128;
        }
    } else {
#endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xInc_shr16 = (x86_reg) (xInc >> 16);
        uint16_t xInc_mask = xInc & 0xffff;
        /* one U and one V sample per iteration, shared position xx */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
which is needed to support GCC 4.0. */
#if ARCH_X86_64 && AV_GCC_VERSION_AT_LEAST(3,4)
            :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if COMPILE_TEMPLATE_MMX2
    } //if MMX2 can't be used
#endif
#else
    int i;
    unsigned int xpos=0;        // 16.16 fixed-point source position
    for (i=0;i<dstWidth;i++) {
        register unsigned int xx=xpos>>16;
        register unsigned int xalpha=(xpos&0xFFFF)>>9;   // 7-bit blend factor
        dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
        dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
        /* slower
        dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
        dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
        */
        xpos+=xInc;
    }
#endif /* ARCH_X86 */
}
2527

    
2528
/* Convert (if needed), horizontally scale and range-convert one chroma line
 * pair; results land at dst (U) and dst+VOFW (V). */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{
    src1 += c->chrSrcOffset;
    src2 += c->chrSrcOffset;

    /* convert the input format to planar 8-bit first, if a converter is set */
    if (c->chrToYV12) {
        c->chrToYV12(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, pal);
        src1 = formatConvBuffer;
        src2 = formatConvBuffer + VOFW;
    }

    if (c->hcscale_fast) { // fast bilinear upscale / crap downscale
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
    } else {
        c->hScale(dst       , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst + VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }

    if (c->chrConvertRange)
        c->chrConvertRange(dst, dstWidth);
}
2555

    
2556
/* Set DEBUG_SWSCALE_BUFFERS to 1 to log buffer bookkeeping via av_log. */
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) if (DEBUG_SWSCALE_BUFFERS) av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
2558

    
2559
/* Scale one horizontal slice of the source image (srcSliceH lines starting at
 * srcSliceY) and write all destination lines that become computable into dst.
 * State that must survive between slices (ring-buffer indices, last input
 * lines, current output line) lives in the context and is restored/stored at
 * entry/exit.  Returns the number of destination lines output by this call
 * (dstY - lastDstY). */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;       /* NULL when no alpha plane is processed */
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    /* Chroma slice position/height in chroma lines (height rounded up). */
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* For packed input all planes alias plane 0. */
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    /* vChrDrop skips chroma input lines by widening the chroma strides. */
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    /* Warn (once) about unaligned destination strides; scaling proceeds anyway. */
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        /* First slice: reset ring-buffer bookkeeping for a new frame. */
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    /* Main loop: produce destination lines until we run out of input lines. */
    for (;dstY < dstH; dstY++) {
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
        if (!enough_lines) {
            /* Not enough input yet: clamp so we at least scale what we have. */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
        }

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        //Do horizontal scaling
        /* Fill the luma (and alpha) ring buffer up to the last needed line. */
        while(lastInLumBuf < lastLumSrcY) {
            uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            hLumFilter, hLumFilterPos, hLumFilterSize,
                            formatConvBuffer,
                            pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                hLumFilter, hLumFilterPos, hLumFilterSize,
                                formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
        }
        /* Same for the chroma ring buffer. */
        while(lastInChrBuf < lastChrSrcY) {
            uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (c->needs_hcscale)
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                hChrFilter, hChrFilterPos, hChrFilterSize,
                                formatConvBuffer,
                                pal);
            lastInChrBuf++;
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if COMPILE_TEMPLATE_MMX
        /* Per-line dither tables for the MMX output code. */
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2) {
            /* Fast path: MMX/accelerated output functions may be used. */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            int i;
            if (flags & SWS_ACCURATE_RND) {
                /* Pack (line pointer pair, coefficient pair) records for the
                 * accurate-rounding MMX vertical scaler. */
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2) {
                    *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        *(void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2) {
                    *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            } else {
                /* Non-accurate path: split each line pointer into low/high
                 * 32-bit halves and duplicate the 16-bit coefficient. */
                for (i=0; i<vLumFilterSize; i++) {
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++) {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else if (vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    int16_t *lumBuf = lumSrcPtr[0];
                    int16_t *chrBuf= chrSrcPtr[0];
                    int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                } else { //General YV12
                    c->yuv2yuvX(c,
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL,
                                       dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                } else if (vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                                       alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                                       dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                } else { //general RGB
                    if(flags & SWS_FULL_CHR_H_INT) {
                        yuv2rgbXinC_full(c,
                                         vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                         vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                         alpSrcPtr, dest, dstW, dstY);
                    } else {
                        c->yuv2packedX(c,
                                       vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                       vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                       alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        } else { // hmm looks like we can't use MMX here without overwriting this array's tail
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                             vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                             vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                             dest, uDest, dstW, chrDstW, dstFormat);
            } else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (is16BPS(dstFormat)) {
                    yuv2yuvX16inC(
                                  vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                  vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                  alpSrcPtr, (uint16_t *) dest, (uint16_t *) uDest, (uint16_t *) vDest, (uint16_t *) aDest, dstW, chrDstW,
                                  dstFormat);
                } else {
                    yuv2yuvXinC(
                                vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                                vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT) {
                    yuv2rgbXinC_full(c,
                                     vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                     vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                     alpSrcPtr, dest, dstW, dstY);
                } else {
                    yuv2packedXinC(c,
                                   vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                                   vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                                   alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* No alpha plane was scaled but the output wants one: fill it opaque. */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
    else                             __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
2919

    
2920
/* Fill in the function pointers of the context for this template instance
 * (C/MMX/MMX2/... depending on RENAME). */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    enum PixelFormat srcFormat = c->srcFormat;

    /* Vertical scaling / output stage. */
    c->yuv2nv12X   = RENAME(yuv2nv12X);
    c->yuv2yuv1    = RENAME(yuv2yuv1);
    c->yuv2yuvX    = RENAME(yuv2yuvX);
    c->yuv2packed1 = RENAME(yuv2packed1);
    c->yuv2packed2 = RENAME(yuv2packed2);
    c->yuv2packedX = RENAME(yuv2packedX);

    /* General horizontal scaler. */
    c->hScale = RENAME(hScale);

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (c->flags & SWS_FAST_BILINEAR && c->canMMX2BeUsed)
#else
    if (c->flags & SWS_FAST_BILINEAR)
#endif
    {
        c->hyscale_fast = RENAME(hyscale_fast);
        c->hcscale_fast = RENAME(hcscale_fast);
    }

    /* Chroma input conversion (source format -> planar YV12-style chroma). */
    c->chrToYV12 = NULL;
    switch(srcFormat) {
    case PIX_FMT_YUYV422  : c->chrToYV12 = RENAME(yuy2ToUV); break;
    case PIX_FMT_UYVY422  : c->chrToYV12 = RENAME(uyvyToUV); break;
    case PIX_FMT_NV12     : c->chrToYV12 = RENAME(nv12ToUV); break;
    case PIX_FMT_NV21     : c->chrToYV12 = RENAME(nv21ToUV); break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV; break;
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE: c->chrToYV12 = RENAME(BEToUV); break;
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE: c->chrToYV12 = RENAME(LEToUV); break;
    }
    /* RGB-family sources: pick the horizontally averaging ("_half") variant
     * when the chroma output is horizontally subsampled. */
    if (c->chrSrcHSubSample) {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV_half; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV_half; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV_half); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV_half; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV_half; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV_half; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV_half); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV_half; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV_half; break;
        }
    } else {
        switch(srcFormat) {
        case PIX_FMT_RGB48BE:
        case PIX_FMT_RGB48LE: c->chrToYV12 = rgb48ToUV; break;
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1: c->chrToYV12 = bgr32ToUV; break;
        case PIX_FMT_BGR24  : c->chrToYV12 = RENAME(bgr24ToUV); break;
        case PIX_FMT_BGR565 : c->chrToYV12 = bgr16ToUV; break;
        case PIX_FMT_BGR555 : c->chrToYV12 = bgr15ToUV; break;
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->chrToYV12 = rgb32ToUV; break;
        case PIX_FMT_RGB24  : c->chrToYV12 = RENAME(rgb24ToUV); break;
        case PIX_FMT_RGB565 : c->chrToYV12 = rgb16ToUV; break;
        case PIX_FMT_RGB555 : c->chrToYV12 = rgb15ToUV; break;
        }
    }

    /* Luma (and alpha) input conversion. */
    c->lumToYV12 = NULL;
    c->alpToYV12 = NULL;
    switch (srcFormat) {
    case PIX_FMT_YUYV422  :
    case PIX_FMT_YUV420P16BE:
    case PIX_FMT_YUV422P16BE:
    case PIX_FMT_YUV444P16BE:
    case PIX_FMT_GRAY16BE : c->lumToYV12 = RENAME(yuy2ToY); break;
    case PIX_FMT_UYVY422  :
    case PIX_FMT_YUV420P16LE:
    case PIX_FMT_YUV422P16LE:
    case PIX_FMT_YUV444P16LE:
    case PIX_FMT_GRAY16LE : c->lumToYV12 = RENAME(uyvyToY); break;
    case PIX_FMT_BGR24    : c->lumToYV12 = RENAME(bgr24ToY); break;
    case PIX_FMT_BGR565   : c->lumToYV12 = bgr16ToY; break;
    case PIX_FMT_BGR555   : c->lumToYV12 = bgr15ToY; break;
    case PIX_FMT_RGB24    : c->lumToYV12 = RENAME(rgb24ToY); break;
    case PIX_FMT_RGB565   : c->lumToYV12 = rgb16ToY; break;
    case PIX_FMT_RGB555   : c->lumToYV12 = rgb15ToY; break;
    case PIX_FMT_RGB8     :
    case PIX_FMT_BGR8     :
    case PIX_FMT_PAL8     :
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY; break;
    case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y; break;
    case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y; break;
    case PIX_FMT_RGB32  :
    case PIX_FMT_RGB32_1: c->lumToYV12 = bgr32ToY; break;
    case PIX_FMT_BGR32  :
    case PIX_FMT_BGR32_1: c->lumToYV12 = rgb32ToY; break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48ToY; break;
    }
    /* Alpha extraction only matters when an alpha plane buffer exists. */
    if (c->alpPixBuf) {
        switch (srcFormat) {
        case PIX_FMT_RGB32  :
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32  :
        case PIX_FMT_BGR32_1: c->alpToYV12 = abgrToA; break;
        }
    }

    /* Byte offsets into the packed source for each component reader. */
    switch (srcFormat) {
    case PIX_FMT_RGB32  :
    case PIX_FMT_BGR32  :
        c->alpSrcOffset = 3;
        break;
    case PIX_FMT_RGB32_1:
    case PIX_FMT_BGR32_1:
        c->lumSrcOffset = ALT32_CORR;
        c->chrSrcOffset = ALT32_CORR;
        break;
    case PIX_FMT_RGB48LE:
        c->lumSrcOffset = 1;
        c->chrSrcOffset = 1;
        c->alpSrcOffset = 1;
        break;
    }

    /* Range conversion applies only to YUV outputs with mismatched ranges. */
    if (c->srcRange != c->dstRange && !(isRGB(c->dstFormat) || isBGR(c->dstFormat))) {
        if (!c->srcRange) {
            c->lumConvertRange = RENAME(lumRangeToJpeg);
            c->chrConvertRange = RENAME(chrRangeToJpeg);
        } else {
            c->lumConvertRange = RENAME(lumRangeFromJpeg);
            c->chrConvertRange = RENAME(chrRangeFromJpeg);
        }
    }

    /* Chroma horizontal scaling is needed unless both ends are gray/mono. */
    if (!isGray(srcFormat) && !isGray(c->dstFormat) &&
        srcFormat != PIX_FMT_MONOBLACK && srcFormat != PIX_FMT_MONOWHITE)
        c->needs_hcscale = 1;
}