Statistics
| Branch: | Revision:

ffmpeg / libswscale / swscale_template.c @ b0e1343b

History | View | Annotate | Download (133 KB)

/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

/*
 * Helper macros selected by the COMPILE_TEMPLATE_* flags.
 *
 * Each helper is #undef'd before being (re)defined, so any earlier
 * definition is discarded (presumably because this template is compiled
 * more than once with different flags -- confirm against the includer).
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW

/*
 * PREFETCH / PREFETCHW: prefetch mnemonics as string literals.
 * Without 3DNow! or MMX2 they expand to " # nop", i.e. text after an
 * assembler '#' rather than an instruction.
 */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

/*
 * PAVGB(a,b): packed byte average ("pavgb" with MMX2, "pavgusb" with
 * 3DNow!). Deliberately left undefined when neither flag is set.
 */
#if COMPILE_TEMPLATE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif COMPILE_TEMPLATE_AMD3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/*
 * MOVNTQ(a,b): 64-bit store, emitted as "movntq" with MMX2 and as "movq"
 * otherwise. The extra REAL_MOVNTQ level makes the preprocessor expand
 * macros inside the arguments (e.g. REGa) before they are stringified.
 */
#if COMPILE_TEMPLATE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

/* AltiVec implementation is pulled in only for AltiVec builds. */
#if COMPILE_TEMPLATE_ALTIVEC
#include "ppc/swscale_altivec_template.c"
#endif

/*
 * YSCALEYUV2YV12X(x, offset, dest, width)
 *
 * Complete asm statement: multi-tap vertical filter writing 8 output bytes
 * per outer iteration.
 *
 *   x      - string literal; byte displacement added to every source pointer
 *   offset - string literal; displacement from operand %0 to the filter list
 *   dest   - output pointer (operand %1), addressed with the counter in REG_a
 *   width  - loop bound (operand %2); iteration continues while REG_a is
 *            below it (unsigned compare, "jb")
 *
 * Operand %0 is &c->redDither, so a variable named c must be visible where
 * the macro is expanded.
 *
 * The filter list is walked in 16-byte steps: a source pointer at +0 and the
 * coefficient words at +8; a null source pointer ends the list. pmulhw
 * products are accumulated on top of the value loaded from
 * VROUNDER_OFFSET(%0), shifted right by 3 and packed to bytes with unsigned
 * saturation.
 *
 * Label "1" serves both loops: the accumulators and list pointer are
 * re-initialised before the final "jb 1b", and none of the instructions
 * between "cmp" and "jb" modify the flags.
 *
 * NOTE(review): the statement stores through %1 but declares neither a
 * memory output nor a "memory" clobber -- confirm this is acceptable for
 * the callers.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    __asm__ volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t"\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/*
 * YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width)
 *
 * Same interface and operands as YSCALEYUV2YV12X, but with 32-bit
 * accumulation. Each filter-list step covers two source lines:
 *   - first source pointer at +0, second at +APCK_PTR2,
 *   - coefficient words at +APCK_COEF,
 *   - the step size is APCK_SIZE; a null pointer at the start of the next
 *     step ends the list.
 * Words of the two lines are interleaved (punpcklwd/punpckhwd) and fed to
 * pmaddwd; the dword sums in mm4..mm7 are shifted right by 16, packed with
 * signed saturation, offset by the value at VROUNDER_OFFSET(%0), shifted
 * right by 3 and packed to bytes with unsigned saturation.
 *
 * Requires a variable named c at the expansion site (operand %0 is
 * &c->redDither).
 *
 * NOTE(review): as with YSCALEYUV2YV12X, the store through %1 is not
 * reflected in the operand/clobber lists -- confirm.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    __asm__ volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov        "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq       "STR(APCK_COEF)"(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov        "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                  $"STR(APCK_SIZE)", %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t"\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/*
 * YSCALEYUV2YV121
 *
 * Asm text fragment (no operands or clobbers): converts 16-bit samples read
 * from (%0, REG_a, 2) to bytes by an arithmetic shift right of 7 followed by
 * an unsigned-saturating pack, and stores 8 bytes per iteration at
 * (%1, REG_a).
 *
 * REG_a is initialised from operand %2 and advanced by 8; the loop ends when
 * that addition carries ("jnc"). Presumably %2 is a negative count and %0/%1
 * are biased to the end of their buffers -- confirm against the caller.
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

/*
 * YSCALEYUV2YV121_ACCURATE
 *
 * Rounding variant of YSCALEYUV2YV121. mm7 is set to 64 in every word
 * (all-ones >> 15 gives 1, then << 6) and added with signed saturation
 * before the shift right by 7, so the result is rounded instead of
 * truncated. Operands and loop structure are otherwise identical:
 * reads (%0, REG_a, 2), writes (%1, REG_a), REG_a starts at %2 and the loop
 * ends when "add $8" carries.
 */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddsw             %%mm7, %%mm0      \n\t"\
    "paddsw             %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/*
 * YSCALEYUV2PACKEDX_UV
 *
 * Opens an "__asm__ volatile(" statement (it is NOT closed here; see
 * YSCALEYUV2PACKEDX_END) and emits the chroma part of the multi-tap
 * vertical filter:
 *   - REG_a is zeroed once, then label "1" starts the per-output loop
 *     (the jump back to "1" is not part of this macro);
 *   - the filter list at CHR_MMX_FILTER_OFFSET(%0) is walked in 16-byte
 *     steps (source pointer at +0, coefficients at +8) until a null source
 *     pointer is found (inner loop "2");
 *   - U samples are read at (src, REG_a), V samples VOF bytes further on;
 *   - results accumulate on top of VROUNDER_OFFSET(%0): U in mm3, V in mm4.
 */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"

/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 *
 * Asm text fragment: multi-tap vertical filter for one 16-bit plane read
 * at (src, REG_a, 2) (named Y1/Y2 in the inline comments).
 *
 *   offset     - string literal; displacement from %0 to the filter list
 *   coeff      - MMX register receiving the coefficient words
 *   src1, src2 - MMX scratch registers for the two loaded quadwords
 *   dst1, dst2 - MMX accumulators, preloaded from VROUNDER_OFFSET(%0)
 *
 * The register arguments are stringified directly, so they are passed in
 * asm-template spelling (e.g. %%mm0). The list is walked in 16-byte steps
 * (pointer at +0, coefficients at +8) until a null source pointer is read.
 * Uses label "2" and clobbers REG_d and REG_S.
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"

/*
 * YSCALEYUV2PACKEDX
 *
 * Chroma pass (YSCALEYUV2PACKEDX_UV, which also opens the asm statement)
 * followed by the luma pass over LUM_MMX_FILTER_OFFSET. On completion the
 * filtered values are in mm3 (U), mm4 (V), mm1 and mm7 (luma); mm0, mm2 and
 * mm5 are used as scratch. The statement must be finished with
 * YSCALEYUV2PACKEDX_END.
 */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7)

/*
 * YSCALEYUV2PACKEDX_END
 *
 * Operand and clobber lists that close the asm statement opened by
 * YSCALEYUV2PACKEDX_UV / YSCALEYUV2PACKEDX_ACCURATE_UV:
 *   %0      = &c->redDither (base for all *_OFFSET displacements)
 *   %1..%3  = dummy (placeholders that keep the numbering of %4/%5)
 *   %4      = dest
 *   %5      = dstW
 * Requires c, dummy, dest and dstW to be in scope at the expansion site.
 *
 * NOTE(review): no "memory" clobber is listed although the preceding asm
 * text may store to memory -- confirm.
 */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );

/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV
 *
 * 32-bit-accumulating counterpart of YSCALEYUV2PACKEDX_UV. Opens an
 * "__asm__ volatile(" statement that is not closed here.
 *
 * The chroma filter list at CHR_MMX_FILTER_OFFSET(%0) is walked in
 * APCK_SIZE steps, two source lines per step (pointers at +0 and
 * +APCK_PTR2, coefficients at +APCK_COEF), until the pointer at the start
 * of the next step is null. U is summed in mm4/mm5 and V (VOF bytes after
 * U) in mm6/mm7; the sums are shifted right by 16, packed with signed
 * saturation and offset by VROUNDER_OFFSET(%0).
 *
 * The results are spilled to memory at U_TEMP(%0) and V_TEMP(%0) rather
 * than kept in registers.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"

/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 *
 * Asm text fragment: 32-bit-accumulating vertical filter for the plane read
 * at (src, REG_a, 2) (named Y1/Y2 in the inline comments).
 *
 *   offset - string literal; displacement from %0 to the filter list, which
 *            uses the APCK_PTR2 / APCK_COEF / APCK_SIZE layout
 *
 * Sums are built in mm1/mm5 and mm7/mm6, shifted right by 16, packed with
 * signed saturation and offset by VROUNDER_OFFSET(%0), leaving the two
 * result quadwords in mm1 and mm7. Finally mm3 and mm4 are reloaded from
 * U_TEMP(%0) and V_TEMP(%0). Uses label "2"; clobbers REG_d, REG_S and
 * mm0..mm7.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"

/*
 * YSCALEYUV2PACKEDX_ACCURATE
 *
 * 32-bit-accumulating counterpart of YSCALEYUV2PACKEDX: chroma pass (which
 * opens the asm statement) followed by the luma pass over
 * LUM_MMX_FILTER_OFFSET. Ends with mm3/mm4 holding the values reloaded from
 * U_TEMP/V_TEMP and mm1/mm7 holding the luma results. The statement must be
 * finished with YSCALEYUV2PACKEDX_END.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

/*
 * YSCALEYUV2RGBX
 *
 * Asm text fragment: YUV -> RGB conversion using the constants stored
 * relative to operand %0 (U_OFFSET, V_OFFSET, Y_OFFSET, UG/VG/UB/VR_COEFF,
 * Y_COEFF).
 *
 * In:  mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2 (as left by YSCALEYUV2PACKEDX
 *      or YSCALEYUV2PACKEDX_ACCURATE)
 * Out: mm2, mm4 and mm5 hold the byte-packed (unsigned-saturated) sums of
 *      luma with the UB product, the UG+VG sum and the VR product
 *      respectively, i.e. the B, G and R channels according to the inline
 *      comments.
 * Scratch: mm0, mm3, mm6.
 */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"

/*
 * REAL_YSCALEYUV2PACKED(index, c)
 *
 * Asm text fragment: two-line vertical blend of chroma and luma.
 *
 *   index - register used as loop counter, in asm-template spelling
 *   c     - operand/register that addresses the context the *_OFFSET
 *           displacements are relative to
 *
 * Operands: %0/%1 are the two luma lines and %2/%3 the two chroma lines
 * (buf0/buf1 and uvbuf0/uvbuf1 in the inline comments); V samples sit VOF
 * bytes after U.
 *
 * Side effect: before the loop, the coefficient words at
 * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c) are shifted
 * right by 3 and written back in place.
 *
 * Each blend is (line1 >> 7) + ((line0 - line1) * coeff >> 16), leaving
 * U in mm3, V in mm4 and luma in mm1/mm7. The loop opened at label "1" is
 * not closed here.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+VOF]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+VOF] - uvbuf1[eax+VOF]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+VOF] - uvbuf1[eax+VOF])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[eax] >>7*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[eax+VOF] >>7*/\
    "paddw             %%mm2, %%mm3     \n\t" /* (uvbuf1[eax]>>7) + ((uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* same blend for the V samples*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax], next quadword*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax], next quadword*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf1[eax] >>7*/\
    "psraw                $7, %%mm7     \n\t" /* buf1[eax] >>7*/\
    "paddw             %%mm0, %%mm1     \n\t" /* (buf1[eax]>>7) + ((buf0[eax] - buf1[eax])yalpha1>>16)*/\
    "paddw             %%mm6, %%mm7     \n\t" /* (buf1[eax]>>7) + ((buf0[eax] - buf1[eax])yalpha1>>16)*/

/*
 * Public spelling of REAL_YSCALEYUV2PACKED. The extra macro level lets the
 * preprocessor expand the arguments before REAL_YSCALEYUV2PACKED
 * stringifies them.
 */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)

/*
 * REAL_YSCALEYUV2RGB_UV(index, c)
 *
 * Asm text fragment: chroma half of the two-line blend used for RGB output.
 *
 *   index - register used as loop counter (zeroed here), asm-template
 *           spelling
 *   c     - operand/register addressing the context for the *_OFFSET and
 *           *_COEFF displacements
 *
 * Operands %2/%3 are the two chroma lines (uvbuf0/uvbuf1 in the inline
 * comments); V samples sit VOF bytes after U. Each blend is
 * (line1 >> 4) + ((line0 - line1) * coeff >> 16) with the coefficient at
 * CHR_MMX_FILTER_OFFSET+8(c). After subtracting U_OFFSET/V_OFFSET the
 * fragment leaves:
 *   mm2 = U - offset, mm5 = V - offset, mm3 = U * UG_COEFF, mm4 = V * VG_COEFF
 * The loop opened at label "1" is not closed here.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+VOF]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+VOF]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+VOF] - uvbuf1[eax+VOF]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+VOF] - uvbuf1[eax+VOF])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf1[eax+VOF] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* (uvbuf1[eax]>>4) + ((uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* same blend for the V samples*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */

/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 *
 * Asm text fragment: two-line vertical blend of 16-bit samples.
 *
 *   index  - loop counter register, asm-template spelling
 *   c      - operand/register addressing the context; the coefficient is
 *            read from LUM_MMX_FILTER_OFFSET+8(c)
 *   b1, b2 - base addresses of the two input lines (buf0/buf1 in the inline
 *            comments)
 *
 * Computes (b2 >> 4) + ((b1 - b2) * coeff >> 16) for two consecutive
 * quadwords, leaving the results in mm1 and mm7. mm0 and mm6 are scratch.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[eax], next quadword*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[eax], next quadword*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* (buf1[eax]>>4) + ((buf0[eax] - buf1[eax])yalpha1>>16)*/\
    "paddw             %%mm6, %%mm7     \n\t" /* (buf1[eax]>>4) + ((buf0[eax] - buf1[eax])yalpha1>>16)*/

/*
 * REAL_YSCALEYUV2RGB_COEFF(c)
 *
 * Asm text fragment: final stage of the YUV -> RGB conversion.
 *
 *   c - operand/register addressing the context that holds UB_COEFF,
 *       VR_COEFF, Y_OFFSET and Y_COEFF
 *
 * In:  mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg,
 *      mm1/mm7 = luma (Y1/Y2)
 * Out: mm2, mm4 and mm5 hold the byte-packed (unsigned-saturated) sums of
 *      luma with ub, ug+vg and vr respectively (B, G, R per the inline
 *      comments).
 * Scratch: mm0, mm3, mm6.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

/*
 * Public spelling of REAL_YSCALEYUV2RGB_YA. The extra macro level lets the
 * preprocessor expand the arguments before they are stringified.
 */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/*
 * YSCALEYUV2RGB(index, c)
 *
 * Two-line blend followed by YUV -> RGB conversion: chroma blend
 * (REAL_YSCALEYUV2RGB_UV, which opens loop label "1"), luma blend of the
 * lines at operands %0 and %1, then REAL_YSCALEYUV2RGB_COEFF.
 *
 *   index - loop counter register, asm-template spelling
 *   c     - operand/register addressing the context with the filter
 *           coefficients and colour constants
 */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

/*
 * REAL_YSCALEYUV2PACKED1(index, c)
 *
 * Asm text fragment: single-line variant (no vertical blend). Zeroes the
 * loop counter, opens loop label "1" (not closed here) and loads, each
 * shifted right arithmetically by 7:
 *   mm3 = chroma at (%2, index), mm4 = chroma VOF bytes further on,
 *   mm1/mm7 = two consecutive luma quadwords from (%0, index, 2).
 *
 *   index - loop counter register, asm-template spelling
 *   c     - accepted for symmetry with the other variants; not referenced
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+VOF]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax], next quadword*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"

/*
 * Public spelling of REAL_YSCALEYUV2PACKED1. The extra macro level lets the
 * preprocessor expand the arguments before they are stringified.
 */
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)

/*
 * REAL_YSCALEYUV2RGB1(index, c)
 *
 * Asm text fragment: single-line (no vertical blend) YUV -> RGB conversion.
 *
 *   index - loop counter register (zeroed here), asm-template spelling
 *   c     - operand/register addressing the context that holds the colour
 *           offsets and coefficients
 *
 * Chroma is read from (%2, index) and VOF bytes further on, luma from
 * (%0, index, 2); every sample is shifted right arithmetically by 4 before
 * the offsets and coefficients are applied.
 *
 * Out: mm2, mm4 and mm5 hold the byte-packed (unsigned-saturated) B, G and
 * R values per the inline comments. The loop opened at label "1" is not
 * closed here.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+VOF]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+VOF] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax], next quadword*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

/*
 * Public spelling of REAL_YSCALEYUV2RGB1. The extra macro level lets the
 * preprocessor expand the arguments before they are stringified.
 */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)

/*
 * Loop prologue for packed output from a single luma line and the sum of
 * two chroma lines (asm string fragment, no RGB conversion).
 *
 * index  register used as loop counter; it is zeroed here.
 * c      accepted for signature parity with the other prologues; it is not
 *        referenced by this macro.
 *
 * Operands used: %0 = luma line, %2 / %3 = the two chroma lines; the second
 * chroma plane is read at a displacement of VOF from the same bases.
 *
 * Register state on exit:
 *   mm3 = (uvbuf0 + uvbuf1) >> 8 at displacement 0   (4 words)
 *   mm4 = (uvbuf0 + uvbuf1) >> 8 at displacement VOF (4 words)
 *   mm1 = buf0[index .. index+3] >> 7
 *   mm7 = buf0[index+4 .. index+7] >> 7
 *
 * The macro only opens local label "1:"; the loop is closed by the
 * "jb 1b" at the end of the WRITE* macro that follows it.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
/* Expands its arguments first, then hands them to the stringifying macro. */
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
577

    
578
/*
 * Loop prologue that does vertical chrominance interpolation (sum of two
 * chroma lines) with a single luma line, followed by YUV -> RGB conversion
 * (asm string fragment).
 *
 * index  register used as loop counter; it is zeroed here.
 * c      base register for the coefficient/offset table; it is addressed
 *        through U_OFFSET, V_OFFSET, Y_OFFSET, UG_COEFF, VG_COEFF, UB_COEFF,
 *        VR_COEFF and Y_COEFF.
 *
 * Operands used: %0 = luma line, %2 / %3 = the two chroma lines; the second
 * chroma plane is read at a displacement of VOF from the same bases.
 *
 * Register state on exit (8 unsigned-saturated bytes each, via packuswb):
 *   mm2 = B, mm4 = G, mm5 = R
 * mm0, mm1, mm3, mm6 and mm7 are used as scratch.
 *
 * The macro only opens local label "1:"; the loop is closed by the
 * "jb 1b" at the end of the WRITE* macro that follows it.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax+4]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

/* Expands its arguments first, then hands them to the stringifying macro. */
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
631

    
632
/*
 * Load 8 alpha samples for the single-line (unscaled vertical) RGB path
 * (asm string fragment).
 *
 * index  register holding the current sample index; operand %1 is the base
 *        of the 16-bit alpha line, addressed as %1 + index*2.
 *
 * Each 16-bit sample is shifted right arithmetically by 7 and the two
 * groups of four are packed with unsigned saturation.
 * Result: mm7 = 8 alpha bytes. mm1 is clobbered.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
/* Expands its argument first, then hands it to the stringifying macro. */
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
639

    
640
/*
 * Loop epilogue: interleave 8 pixels held as separate byte planes into
 * 32-bit pixels, store them and close the loop (asm string fragment).
 *
 * dst, dstw, index  destination base, loop limit and loop counter; each
 *                   iteration stores 32 bytes at dst + index*4, adds 8 to
 *                   index and jumps back to label 1 while index < dstw
 *                   (unsigned compare, "jb").
 * b, g, r, a        registers holding 8 bytes of each channel; b and r are
 *                   overwritten, g and a are only read.
 * q0, q2, q3, t     scratch registers.
 *
 * In memory each pixel is written as the bytes B, G, R, A (lowest address
 * first), which is what the "ARGB" notation in the comments below shows from
 * the most significant byte down.
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
/* Expands its arguments first, then hands them to the stringifying macro. */
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
663

    
664
/*
 * Loop epilogue: pack 8 pixels into 16 bits each (5 bits blue, 6 bits
 * green, 5 bits red), store them and close the loop (asm string fragment).
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0.
 * B and R are masked with bF8, G with bFC; B is shifted down by 3, G is
 * widened to words and shifted up by 3, and R ends up in the high byte of
 * each word through the byte interleave with B.
 *
 * Each iteration stores 16 bytes at dst + index*2, adds 8 to index and
 * jumps back to label 1 while index < dstw (unsigned compare, "jb").
 * mm1..mm5 are clobbered.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
/* Expands its arguments first, then hands them to the stringifying macro. */
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
691

    
692
/*
 * Loop epilogue: pack 8 pixels into 16 bits each with 5 bits per channel,
 * store them and close the loop (asm string fragment).
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0.
 * All three channels are masked with bF8; B is shifted down by 3, R down
 * by 1, and G is widened to words and shifted up by 2.
 *
 * Each iteration stores 16 bytes at dst + index*2, adds 8 to index and
 * jumps back to label 1 while index < dstw (unsigned compare, "jb").
 * mm1..mm5 are clobbered.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
/* Expands its arguments first, then hands them to the stringifying macro. */
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
720

    
721
/*
 * Loop epilogue: write 8 pixels as 24 bytes of packed 3-byte pixels using
 * shift/mask/or sequences, then close the loop (asm string fragment).
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0.
 * Unlike the 16/32-bit writers, dst is used as a running pointer: three
 * quadwords are stored at dst, dst+8 and dst+16 and dst is then advanced by
 * 24. index is advanced by 8 and the loop repeats from label 1 while
 * index < dstw (unsigned compare, "jb").
 * mm0..mm6 are clobbered.
 *
 * NOTE(review): nothing in the visible part of the file selects this
 * variant (WRITEBGR24 maps to the MMX or MMX2 version) - confirm it is
 * still needed.
 */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
776

    
777
/*
 * Loop epilogue: write 8 pixels as 24 bytes of packed 3-byte pixels using
 * shifts and punpckhdq only, then close the loop (asm string fragment).
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each) and mm7 = 0.
 * dst is used as a running pointer: three quadwords are stored at dst,
 * dst+8 and dst+16 and dst is then advanced by 24. index is advanced by 8
 * and the loop repeats from label 1 while index < dstw (unsigned compare,
 * "jb").
 * mm0..mm7 are all clobbered, so mm7 is no longer zero afterwards.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
829

    
830
/*
 * Loop epilogue: write 8 pixels as 24 bytes of packed 3-byte pixels using
 * pshufw plus the ff_M24A / ff_M24B / ff_M24C byte masks, then close the
 * loop (asm string fragment).
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (8 bytes each). mm0 and mm7 are loaded
 * with masks at the top, so whatever mm7 held on entry is overwritten.
 * mm4 is shifted right by 8 part-way through; mm1, mm3 and mm6 are scratch.
 *
 * dst is used as a running pointer: three quadwords are stored at dst,
 * dst+8 and dst+16 and dst is then advanced by 24. index is advanced by 8
 * and the loop repeats from label 1 while index < dstw (unsigned compare,
 * "jb").
 */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R; mm0 and mm7 become masks */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
877

    
878
/*
 * Select the 24-bit writer for this instantiation of the template:
 * the pshufw-based version when COMPILE_TEMPLATE_MMX2 is set, the plain
 * MMX version otherwise. Any definition left over from a previous
 * inclusion of the template is dropped first.
 */
#undef WRITEBGR24
#if COMPILE_TEMPLATE_MMX2
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
885

    
886
/*
 * Loop epilogue: pack 8 luma and 4+4 chroma samples into 16 bytes of
 * interleaved 4:2:2 output, store them and close the loop (asm string
 * fragment).
 *
 * Expects 16-bit samples in mm1 (first four luma), mm7 (next four luma),
 * mm3 and mm4 (the two chroma planes, four samples each). All are narrowed
 * to bytes with unsigned saturation; the chroma bytes are interleaved with
 * each other and then with the luma bytes, giving luma, mm3-chroma, luma,
 * mm4-chroma in memory.
 *
 * Each iteration stores 16 bytes at dst + index*2, adds 8 to index and
 * jumps back to label 1 while index < dstw (unsigned compare, "jb").
 * mm1, mm3, mm4 and mm7 are clobbered.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
/* Expands its arguments first, then hands them to the stringifying macro. */
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
902

    
903

    
904
/**
 * Vertical multi-tap scaling to planar output.
 *
 * With COMPILE_TEMPLATE_MMX and SWS_BITEXACT not set in c->flags, every
 * plane is produced by the YSCALEYUV2YV12X macro (or its _ACCURATE variant
 * when SWS_ACCURATE_RND is set) and the function returns. The chroma planes
 * are only written when uDest is non-NULL, the alpha plane only when
 * CONFIG_SWSCALE_ALPHA is enabled and aDest is non-NULL.
 *
 * Otherwise the work is forwarded to yuv2yuvX_altivec_real() when
 * COMPILE_TEMPLATE_ALTIVEC is set (that call takes no alpha arguments) or
 * to yuv2yuvXinC().
 *
 * @param c              scaler context; flags select the code path
 * @param lumFilter      vertical luma filter coefficients
 * @param lumSrc         luma source lines
 * @param lumFilterSize  number of luma taps
 * @param chrFilter      vertical chroma filter coefficients
 * @param chrSrc         chroma source lines
 * @param chrFilterSize  number of chroma taps
 * @param alpSrc         alpha source lines
 * @param dest           luma destination
 * @param uDest          first chroma destination, may be NULL
 * @param vDest          second chroma destination
 * @param aDest          alpha destination, may be NULL
 * @param dstW           luma/alpha output width
 * @param chrDstW        chroma output width
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        if (c->flags & SWS_ACCURATE_RND){
            if (uDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
                YSCALEYUV2YV12X_ACCURATE(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }else{
            if (uDest){
                YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
                YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
            }
            if (CONFIG_SWSCALE_ALPHA && aDest){
                YSCALEYUV2YV12X(   "0", ALP_MMX_FILTER_OFFSET, aDest, dstW)
            }

            YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
        }
        return;
    }
#endif
#if COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                          chrFilter, chrSrc, chrFilterSize,
                          dest, uDest, vDest, dstW, chrDstW);
#else //COMPILE_TEMPLATE_ALTIVEC
    yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
                chrFilter, chrSrc, chrFilterSize,
                alpSrc, dest, uDest, vDest, aDest, dstW, chrDstW);
#endif //!COMPILE_TEMPLATE_ALTIVEC
}
944

    
945
/**
 * Vertical multi-tap scaling to an NV12-style destination.
 *
 * This template has no accelerated version: every argument except the
 * context is forwarded unchanged to yuv2nv12XinC(). The context c is not
 * used.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
                 chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
953

    
954
/**
 * Vertical 1-tap output: convert one line of 16-bit intermediate samples
 * per plane to 8-bit planar output.
 *
 * With COMPILE_TEMPLATE_MMX and SWS_BITEXACT not set in c->flags, each
 * plane whose destination pointer is non-NULL is converted by the
 * YSCALEYUV2YV121 asm fragment (or its _ACCURATE variant when
 * SWS_ACCURATE_RND is set). The asm is handed pointers to the END of the
 * source and destination lines plus a negative count.
 *
 * Otherwise the C fallback computes (sample + 64) >> 7 and clamps the
 * result to 0..255. Chroma is only written when uDest is non-NULL, alpha
 * only when CONFIG_SWSCALE_ALPHA is enabled and aDest is non-NULL.
 *
 * @param c        scaler context; flags select the code path
 * @param lumSrc   luma source line
 * @param chrSrc   chroma source line; the second plane starts VOFW
 *                 elements after the first
 * @param alpSrc   alpha source line
 * @param dest     luma destination
 * @param uDest    first chroma destination, may be NULL
 * @param vDest    second chroma destination
 * @param aDest    alpha destination, may be NULL
 * @param dstW     luma/alpha output width
 * @param chrDstW  chroma output width
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, const int16_t *lumSrc, const int16_t *chrSrc, const int16_t *alpSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, uint8_t *aDest, long dstW, long chrDstW)
{
    int i;
#if COMPILE_TEMPLATE_MMX
    if(!(c->flags & SWS_BITEXACT)){
        long p= 4;
        /* Same element type as the inputs: the offsets are already counted in
           int16_t units, and the pointers are only passed on to the asm. */
        const int16_t *src[4]= {alpSrc + dstW, lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
        uint8_t *dst[4]= {aDest, dest, uDest, vDest};
        x86_reg counter[4]= {dstW, dstW, chrDstW, chrDstW};

        if (c->flags & SWS_ACCURATE_RND){
            /* planes are handled from index 3 down to 0; NULL destinations are skipped */
            while(p--){
                if (dst[p]){
                    __asm__ volatile(
                        YSCALEYUV2YV121_ACCURATE
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }else{
            while(p--){
                if (dst[p]){
                    __asm__ volatile(
                        YSCALEYUV2YV121
                        :: "r" (src[p]), "r" (dst[p] + counter[p]),
                        "g" (-counter[p])
                        : "%"REG_a
                    );
                }
            }
        }
        return;
    }
#endif
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        /* bit 8 is set exactly when val is outside 0..255 for 16-bit input */
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            /* one combined test covers both samples; clamp only when needed */
            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }

    if (CONFIG_SWSCALE_ALPHA && aDest)
        for (i=0; i<dstW; i++){
            int val= (alpSrc[i]+64)>>7;
            aDest[i]= av_clip_uint8(val);
        }
}
1026

    
1027

    
1028
/**
1029
 * vertical scale YV12 to RGB
1030
 */
1031
static inline void RENAME(yuv2packedX)(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize,
1032
                                       const int16_t *chrFilter, const int16_t **chrSrc, int chrFilterSize,
1033
                                       const int16_t **alpSrc, uint8_t *dest, long dstW, long dstY)
1034
{
1035
#if COMPILE_TEMPLATE_MMX
1036
    x86_reg dummy=0;
1037
    if(!(c->flags & SWS_BITEXACT)){
1038
        if (c->flags & SWS_ACCURATE_RND){
1039
            switch(c->dstFormat){
1040
            case PIX_FMT_RGB32:
1041
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1042
                    YSCALEYUV2PACKEDX_ACCURATE
1043
                    YSCALEYUV2RGBX
1044
                    "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
1045
                    "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
1046
                    "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
1047
                    YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
1048
                    "movq               "Y_TEMP"(%0), %%mm5         \n\t"
1049
                    "psraw                        $3, %%mm1         \n\t"
1050
                    "psraw                        $3, %%mm7         \n\t"
1051
                    "packuswb                  %%mm7, %%mm1         \n\t"
1052
                    WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
1053

    
1054
                    YSCALEYUV2PACKEDX_END
1055
                }else{
1056
                    YSCALEYUV2PACKEDX_ACCURATE
1057
                    YSCALEYUV2RGBX
1058
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1059
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1060

    
1061
                    YSCALEYUV2PACKEDX_END
1062
                }
1063
                return;
1064
            case PIX_FMT_BGR24:
1065
                YSCALEYUV2PACKEDX_ACCURATE
1066
                YSCALEYUV2RGBX
1067
                "pxor %%mm7, %%mm7 \n\t"
1068
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1069
                "add %4, %%"REG_c"                        \n\t"
1070
                WRITEBGR24(%%REGc, %5, %%REGa)
1071

    
1072

    
1073
                :: "r" (&c->redDither),
1074
                "m" (dummy), "m" (dummy), "m" (dummy),
1075
                "r" (dest), "m" (dstW)
1076
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1077
                );
1078
                return;
1079
            case PIX_FMT_RGB555:
1080
                YSCALEYUV2PACKEDX_ACCURATE
1081
                YSCALEYUV2RGBX
1082
                "pxor %%mm7, %%mm7 \n\t"
1083
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1084
#ifdef DITHER1XBPP
1085
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1086
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1087
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1088
#endif
1089

    
1090
                WRITERGB15(%4, %5, %%REGa)
1091
                YSCALEYUV2PACKEDX_END
1092
                return;
1093
            case PIX_FMT_RGB565:
1094
                YSCALEYUV2PACKEDX_ACCURATE
1095
                YSCALEYUV2RGBX
1096
                "pxor %%mm7, %%mm7 \n\t"
1097
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1098
#ifdef DITHER1XBPP
1099
                "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
1100
                "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
1101
                "paddusb "RED_DITHER"(%0), %%mm5\n\t"
1102
#endif
1103

    
1104
                WRITERGB16(%4, %5, %%REGa)
1105
                YSCALEYUV2PACKEDX_END
1106
                return;
1107
            case PIX_FMT_YUYV422:
1108
                YSCALEYUV2PACKEDX_ACCURATE
1109
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1110

    
1111
                "psraw $3, %%mm3    \n\t"
1112
                "psraw $3, %%mm4    \n\t"
1113
                "psraw $3, %%mm1    \n\t"
1114
                "psraw $3, %%mm7    \n\t"
1115
                WRITEYUY2(%4, %5, %%REGa)
1116
                YSCALEYUV2PACKEDX_END
1117
                return;
1118
            }
1119
        }else{
1120
            switch(c->dstFormat)
1121
            {
1122
            case PIX_FMT_RGB32:
1123
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1124
                    YSCALEYUV2PACKEDX
1125
                    YSCALEYUV2RGBX
1126
                    YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
1127
                    "psraw                        $3, %%mm1         \n\t"
1128
                    "psraw                        $3, %%mm7         \n\t"
1129
                    "packuswb                  %%mm7, %%mm1         \n\t"
1130
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1131
                    YSCALEYUV2PACKEDX_END
1132
                }else{
1133
                    YSCALEYUV2PACKEDX
1134
                    YSCALEYUV2RGBX
1135
                    "pcmpeqd %%mm7, %%mm7 \n\t"
1136
                    WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1137
                    YSCALEYUV2PACKEDX_END
1138
                }
1139
                return;
1140
            case PIX_FMT_BGR24:
1141
                YSCALEYUV2PACKEDX
1142
                YSCALEYUV2RGBX
1143
                "pxor                    %%mm7, %%mm7       \n\t"
1144
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1145
                "add                        %4, %%"REG_c"   \n\t"
1146
                WRITEBGR24(%%REGc, %5, %%REGa)
1147

    
1148
                :: "r" (&c->redDither),
1149
                "m" (dummy), "m" (dummy), "m" (dummy),
1150
                "r" (dest),  "m" (dstW)
1151
                : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1152
                );
1153
                return;
1154
            case PIX_FMT_RGB555:
1155
                YSCALEYUV2PACKEDX
1156
                YSCALEYUV2RGBX
1157
                "pxor %%mm7, %%mm7 \n\t"
1158
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1159
#ifdef DITHER1XBPP
1160
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1161
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1162
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1163
#endif
1164

    
1165
                WRITERGB15(%4, %5, %%REGa)
1166
                YSCALEYUV2PACKEDX_END
1167
                return;
1168
            case PIX_FMT_RGB565:
1169
                YSCALEYUV2PACKEDX
1170
                YSCALEYUV2RGBX
1171
                "pxor %%mm7, %%mm7 \n\t"
1172
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1173
#ifdef DITHER1XBPP
1174
                "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
1175
                "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
1176
                "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
1177
#endif
1178

    
1179
                WRITERGB16(%4, %5, %%REGa)
1180
                YSCALEYUV2PACKEDX_END
1181
                return;
1182
            case PIX_FMT_YUYV422:
1183
                YSCALEYUV2PACKEDX
1184
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1185

    
1186
                "psraw $3, %%mm3    \n\t"
1187
                "psraw $3, %%mm4    \n\t"
1188
                "psraw $3, %%mm1    \n\t"
1189
                "psraw $3, %%mm7    \n\t"
1190
                WRITEYUY2(%4, %5, %%REGa)
1191
                YSCALEYUV2PACKEDX_END
1192
                return;
1193
            }
1194
        }
1195
    }
1196
#endif /* COMPILE_TEMPLATE_MMX */
1197
#if COMPILE_TEMPLATE_ALTIVEC
1198
    /* The following list of supported dstFormat values should
1199
       match what's found in the body of ff_yuv2packedX_altivec() */
1200
    if (!(c->flags & SWS_BITEXACT) && !c->alpPixBuf &&
1201
       (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1202
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1203
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB))
1204
            ff_yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize,
1205
                                   chrFilter, chrSrc, chrFilterSize,
1206
                                   dest, dstW, dstY);
1207
    else
1208
#endif
1209
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1210
                       chrFilter, chrSrc, chrFilterSize,
1211
                       alpSrc, dest, dstW, dstY);
1212
}
1213

    
1214
/**
1215
 * vertical bilinear scale YV12 to RGB
1216
 */
1217
static inline void RENAME(yuv2packed2)(SwsContext *c, const uint16_t *buf0, const uint16_t *buf1, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1218
                          const uint16_t *abuf0, const uint16_t *abuf1, uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1219
{
1220
    int  yalpha1=4095- yalpha;
1221
    int uvalpha1=4095-uvalpha;
1222
    int i;
1223

    
1224
#if COMPILE_TEMPLATE_MMX
1225
    if(!(c->flags & SWS_BITEXACT)){
1226
        switch(c->dstFormat)
1227
        {
1228
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1229
            case PIX_FMT_RGB32:
1230
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1231
#if ARCH_X86_64
1232
                    __asm__ volatile(
1233
                    YSCALEYUV2RGB(%%REGBP, %5)
1234
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %6, %7)
1235
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1236
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1237
                    "packuswb            %%mm7, %%mm1       \n\t"
1238
                    WRITEBGR32(%4, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1239

    
1240
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "r" (dest),
1241
                    "a" (&c->redDither)
1242
                    ,"r" (abuf0), "r" (abuf1)
1243
                    : "%"REG_BP
1244
                    );
1245
#else
1246
                    *(uint16_t **)(&c->u_temp)=abuf0;
1247
                    *(uint16_t **)(&c->v_temp)=abuf1;
1248
                    __asm__ volatile(
1249
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1250
                    "mov        %4, %%"REG_b"               \n\t"
1251
                    "push %%"REG_BP"                        \n\t"
1252
                    YSCALEYUV2RGB(%%REGBP, %5)
1253
                    "push                   %0              \n\t"
1254
                    "push                   %1              \n\t"
1255
                    "mov          "U_TEMP"(%5), %0          \n\t"
1256
                    "mov          "V_TEMP"(%5), %1          \n\t"
1257
                    YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
1258
                    "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1259
                    "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
1260
                    "packuswb            %%mm7, %%mm1       \n\t"
1261
                    "pop                    %1              \n\t"
1262
                    "pop                    %0              \n\t"
1263
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
1264
                    "pop %%"REG_BP"                         \n\t"
1265
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1266

    
1267
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1268
                    "a" (&c->redDither)
1269
                    );
1270
#endif
1271
                }else{
1272
                    __asm__ volatile(
1273
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1274
                    "mov        %4, %%"REG_b"               \n\t"
1275
                    "push %%"REG_BP"                        \n\t"
1276
                    YSCALEYUV2RGB(%%REGBP, %5)
1277
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1278
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1279
                    "pop %%"REG_BP"                         \n\t"
1280
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1281

    
1282
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1283
                    "a" (&c->redDither)
1284
                    );
1285
                }
1286
                return;
1287
            case PIX_FMT_BGR24:
1288
                __asm__ volatile(
1289
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1290
                "mov        %4, %%"REG_b"               \n\t"
1291
                "push %%"REG_BP"                        \n\t"
1292
                YSCALEYUV2RGB(%%REGBP, %5)
1293
                "pxor    %%mm7, %%mm7                   \n\t"
1294
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1295
                "pop %%"REG_BP"                         \n\t"
1296
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1297
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1298
                "a" (&c->redDither)
1299
                );
1300
                return;
1301
            case PIX_FMT_RGB555:
1302
                __asm__ volatile(
1303
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1304
                "mov        %4, %%"REG_b"               \n\t"
1305
                "push %%"REG_BP"                        \n\t"
1306
                YSCALEYUV2RGB(%%REGBP, %5)
1307
                "pxor    %%mm7, %%mm7                   \n\t"
1308
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1309
#ifdef DITHER1XBPP
1310
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1311
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1312
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1313
#endif
1314

    
1315
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1316
                "pop %%"REG_BP"                         \n\t"
1317
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1318

    
1319
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1320
                "a" (&c->redDither)
1321
                );
1322
                return;
1323
            case PIX_FMT_RGB565:
1324
                __asm__ volatile(
1325
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1326
                "mov        %4, %%"REG_b"               \n\t"
1327
                "push %%"REG_BP"                        \n\t"
1328
                YSCALEYUV2RGB(%%REGBP, %5)
1329
                "pxor    %%mm7, %%mm7                   \n\t"
1330
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1331
#ifdef DITHER1XBPP
1332
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1333
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1334
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1335
#endif
1336

    
1337
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1338
                "pop %%"REG_BP"                         \n\t"
1339
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1340
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1341
                "a" (&c->redDither)
1342
                );
1343
                return;
1344
            case PIX_FMT_YUYV422:
1345
                __asm__ volatile(
1346
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1347
                "mov %4, %%"REG_b"                        \n\t"
1348
                "push %%"REG_BP"                        \n\t"
1349
                YSCALEYUV2PACKED(%%REGBP, %5)
1350
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1351
                "pop %%"REG_BP"                         \n\t"
1352
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1353
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1354
                "a" (&c->redDither)
1355
                );
1356
                return;
1357
            default: break;
1358
        }
1359
    }
1360
#endif //COMPILE_TEMPLATE_MMX
1361
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C(void,0), YSCALE_YUV_2_GRAY16_2_C, YSCALE_YUV_2_MONO2_C)
1362
}
1363

    
1364
/**
1365
 * YV12 to RGB without scaling or interpolating
1366
 */
1367
static inline void RENAME(yuv2packed1)(SwsContext *c, const uint16_t *buf0, const uint16_t *uvbuf0, const uint16_t *uvbuf1,
1368
                          const uint16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1369
{
1370
    const int yalpha1=0;
1371
    int i;
1372

    
1373
    const uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1374
    const int yalpha= 4096; //FIXME ...
1375

    
1376
    if (flags&SWS_FULL_CHR_H_INT)
1377
    {
1378
        c->yuv2packed2(c, buf0, buf0, uvbuf0, uvbuf1, abuf0, abuf0, dest, dstW, 0, uvalpha, y);
1379
        return;
1380
    }
1381

    
1382
#if COMPILE_TEMPLATE_MMX
1383
    if(!(flags & SWS_BITEXACT)){
1384
        if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1385
        {
1386
            switch(dstFormat)
1387
            {
1388
            case PIX_FMT_RGB32:
1389
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1390
                    __asm__ volatile(
1391
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1392
                    "mov        %4, %%"REG_b"               \n\t"
1393
                    "push %%"REG_BP"                        \n\t"
1394
                    YSCALEYUV2RGB1(%%REGBP, %5)
1395
                    YSCALEYUV2RGB1_ALPHA(%%REGBP)
1396
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1397
                    "pop %%"REG_BP"                         \n\t"
1398
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1399

    
1400
                    :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1401
                    "a" (&c->redDither)
1402
                    );
1403
                }else{
1404
                    __asm__ volatile(
1405
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1406
                    "mov        %4, %%"REG_b"               \n\t"
1407
                    "push %%"REG_BP"                        \n\t"
1408
                    YSCALEYUV2RGB1(%%REGBP, %5)
1409
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1410
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1411
                    "pop %%"REG_BP"                         \n\t"
1412
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1413

    
1414
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1415
                    "a" (&c->redDither)
1416
                    );
1417
                }
1418
                return;
1419
            case PIX_FMT_BGR24:
1420
                __asm__ volatile(
1421
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1422
                "mov        %4, %%"REG_b"               \n\t"
1423
                "push %%"REG_BP"                        \n\t"
1424
                YSCALEYUV2RGB1(%%REGBP, %5)
1425
                "pxor    %%mm7, %%mm7                   \n\t"
1426
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1427
                "pop %%"REG_BP"                         \n\t"
1428
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1429

    
1430
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1431
                "a" (&c->redDither)
1432
                );
1433
                return;
1434
            case PIX_FMT_RGB555:
1435
                __asm__ volatile(
1436
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1437
                "mov        %4, %%"REG_b"               \n\t"
1438
                "push %%"REG_BP"                        \n\t"
1439
                YSCALEYUV2RGB1(%%REGBP, %5)
1440
                "pxor    %%mm7, %%mm7                   \n\t"
1441
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1442
#ifdef DITHER1XBPP
1443
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1444
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1445
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1446
#endif
1447
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1448
                "pop %%"REG_BP"                         \n\t"
1449
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1450

    
1451
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1452
                "a" (&c->redDither)
1453
                );
1454
                return;
1455
            case PIX_FMT_RGB565:
1456
                __asm__ volatile(
1457
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1458
                "mov        %4, %%"REG_b"               \n\t"
1459
                "push %%"REG_BP"                        \n\t"
1460
                YSCALEYUV2RGB1(%%REGBP, %5)
1461
                "pxor    %%mm7, %%mm7                   \n\t"
1462
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1463
#ifdef DITHER1XBPP
1464
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1465
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1466
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1467
#endif
1468

    
1469
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1470
                "pop %%"REG_BP"                         \n\t"
1471
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1472

    
1473
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1474
                "a" (&c->redDither)
1475
                );
1476
                return;
1477
            case PIX_FMT_YUYV422:
1478
                __asm__ volatile(
1479
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1480
                "mov        %4, %%"REG_b"               \n\t"
1481
                "push %%"REG_BP"                        \n\t"
1482
                YSCALEYUV2PACKED1(%%REGBP, %5)
1483
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1484
                "pop %%"REG_BP"                         \n\t"
1485
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1486

    
1487
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488
                "a" (&c->redDither)
1489
                );
1490
                return;
1491
            }
1492
        }
1493
        else
1494
        {
1495
            switch(dstFormat)
1496
            {
1497
            case PIX_FMT_RGB32:
1498
                if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf){
1499
                    __asm__ volatile(
1500
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1501
                    "mov        %4, %%"REG_b"               \n\t"
1502
                    "push %%"REG_BP"                        \n\t"
1503
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1504
                    YSCALEYUV2RGB1_ALPHA(%%REGBP)
1505
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1506
                    "pop %%"REG_BP"                         \n\t"
1507
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1508

    
1509
                    :: "c" (buf0), "d" (abuf0), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1510
                    "a" (&c->redDither)
1511
                    );
1512
                }else{
1513
                    __asm__ volatile(
1514
                    "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1515
                    "mov        %4, %%"REG_b"               \n\t"
1516
                    "push %%"REG_BP"                        \n\t"
1517
                    YSCALEYUV2RGB1b(%%REGBP, %5)
1518
                    "pcmpeqd %%mm7, %%mm7                   \n\t"
1519
                    WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1520
                    "pop %%"REG_BP"                         \n\t"
1521
                    "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1522

    
1523
                    :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1524
                    "a" (&c->redDither)
1525
                    );
1526
                }
1527
                return;
1528
            case PIX_FMT_BGR24:
1529
                __asm__ volatile(
1530
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1531
                "mov        %4, %%"REG_b"               \n\t"
1532
                "push %%"REG_BP"                        \n\t"
1533
                YSCALEYUV2RGB1b(%%REGBP, %5)
1534
                "pxor    %%mm7, %%mm7                   \n\t"
1535
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1536
                "pop %%"REG_BP"                         \n\t"
1537
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1538

    
1539
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1540
                "a" (&c->redDither)
1541
                );
1542
                return;
1543
            case PIX_FMT_RGB555:
1544
                __asm__ volatile(
1545
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1546
                "mov        %4, %%"REG_b"               \n\t"
1547
                "push %%"REG_BP"                        \n\t"
1548
                YSCALEYUV2RGB1b(%%REGBP, %5)
1549
                "pxor    %%mm7, %%mm7                   \n\t"
1550
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1551
#ifdef DITHER1XBPP
1552
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1553
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1554
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1555
#endif
1556
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1557
                "pop %%"REG_BP"                         \n\t"
1558
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1559

    
1560
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1561
                "a" (&c->redDither)
1562
                );
1563
                return;
1564
            case PIX_FMT_RGB565:
1565
                __asm__ volatile(
1566
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1567
                "mov        %4, %%"REG_b"               \n\t"
1568
                "push %%"REG_BP"                        \n\t"
1569
                YSCALEYUV2RGB1b(%%REGBP, %5)
1570
                "pxor    %%mm7, %%mm7                   \n\t"
1571
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1572
#ifdef DITHER1XBPP
1573
                "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1574
                "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1575
                "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1576
#endif
1577

    
1578
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1579
                "pop %%"REG_BP"                         \n\t"
1580
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1581

    
1582
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1583
                "a" (&c->redDither)
1584
                );
1585
                return;
1586
            case PIX_FMT_YUYV422:
1587
                __asm__ volatile(
1588
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1589
                "mov        %4, %%"REG_b"               \n\t"
1590
                "push %%"REG_BP"                        \n\t"
1591
                YSCALEYUV2PACKED1b(%%REGBP, %5)
1592
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1593
                "pop %%"REG_BP"                         \n\t"
1594
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1595

    
1596
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1597
                "a" (&c->redDither)
1598
                );
1599
                return;
1600
            }
1601
        }
1602
    }
1603
#endif /* COMPILE_TEMPLATE_MMX */
1604
    if (uvalpha < 2048)
1605
    {
1606
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1607
    }else{
1608
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C(void,0), YSCALE_YUV_2_GRAY16_1_C, YSCALE_YUV_2_MONO2_C)
1609
    }
1610
}
1611

    
1612
//FIXME yuy2* can read up to 7 samples too much
1613

    
1614
/**
 * Extract the luma samples from a packed YUYV line.
 *
 * Output sample i is input byte 2*i (the even bytes of the line).
 *
 * @param dst    output luma line, width bytes
 * @param src    packed input line, 2*width bytes
 * @param width  number of luma samples to produce
 * @param unused not referenced
 *
 * The MMX path handles 8 output samples per iteration and always runs the
 * loop body at least once, so it can read and write past width when width is
 * not a positive multiple of 8.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* Both pointers are biased to the end of their lines and REG_a counts up
     * from -width, so the loop ends when the index stops being negative. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm2           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
    /* mask with bm01010101, then pack the 16-bit words down to bytes */
    "pand                %%mm2, %%mm0           \n\t"
    "pand                %%mm2, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
    "add                    $8, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
    /* "memory": the asm stores through dst, which is not an output operand */
    : "%"REG_a, "memory"
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1638

    
1639
/**
 * Extract the chroma samples from a packed YUYV line.
 *
 * For output index i, U is input byte 4*i + 1 and V is input byte 4*i + 3.
 *
 * @param dstU   output U line, width bytes
 * @param dstV   output V line, width bytes
 * @param src1   packed input line, 4*width bytes
 * @param src2   must be equal to src1 (asserted); not otherwise used
 * @param width  number of chroma samples to produce
 * @param unused not referenced
 *
 * The MMX path handles 4 output samples per iteration and always runs the
 * loop body at least once.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* Pointers are biased to the end of their lines; REG_a counts up from
     * -width and the loop ends when it stops being negative. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    /* keep the odd bytes (interleaved U and V), dropping luma */
    "psrlw                  $8, %%mm0           \n\t"
    "psrlw                  $8, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    /* split the interleaved bytes: mm0 -> V (high bytes), mm1 -> U (low) */
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    /* "memory": the asm stores through dstU/dstV, which are not outputs */
    : "%"REG_a, "memory"
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1673

    
1674
/**
 * Reduce two lines of 2-byte samples to 8 bits by keeping the second byte of
 * each sample (the most significant byte when samples are little-endian).
 *
 * dstU[i] = src1[2*i + 1] and dstV[i] = src2[2*i + 1].
 *
 * @param dstU   output U line, width bytes
 * @param dstV   output V line, width bytes
 * @param src1   input U line, 2*width bytes
 * @param src2   input V line, 2*width bytes
 * @param width  number of samples to produce per plane
 * @param unused not referenced
 *
 * The MMX path handles 8 samples per iteration and always runs the loop body
 * at least once.
 */
static inline void RENAME(LEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* Pointers are biased to the end of their lines; REG_a counts up from
     * -width and the loop ends when it stops being negative. */
    __asm__ volatile(
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
    "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
    "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
    /* shift the high byte of every word down, then pack words to bytes */
    "psrlw                  $8, %%mm0           \n\t"
    "psrlw                  $8, %%mm1           \n\t"
    "psrlw                  $8, %%mm2           \n\t"
    "psrlw                  $8, %%mm3           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "packuswb            %%mm3, %%mm2           \n\t"
    "movq                %%mm0, (%3, %%"REG_a") \n\t"
    "movq                %%mm2, (%4, %%"REG_a") \n\t"
    "add                    $8, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
    /* "memory": the asm stores through dstU/dstV, which are not outputs */
    : "%"REG_a, "memory"
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[2*i + 1];
        dstV[i]= src2[2*i + 1];
    }
#endif
}
1706

    
1707
/**
 * Extract the luma samples from a packed UYVY line.
 *
 * This is almost identical to yuy2ToY and exists only because calling
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses.
 *
 * Output sample i is input byte 2*i + 1 (the odd bytes of the line).
 *
 * @param dst    output luma line, width bytes
 * @param src    packed input line, 2*width bytes
 * @param width  number of luma samples to produce
 * @param unused not referenced
 *
 * The MMX path handles 8 output samples per iteration and always runs the
 * loop body at least once.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* Pointers are biased to the end of their lines; REG_a counts up from
     * -width and the loop ends when it stops being negative. */
    __asm__ volatile(
    "mov                  %0, %%"REG_a"         \n\t"
    "1:                                         \n\t"
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
    /* shift the high byte of every word down, then pack words to bytes */
    "psrlw                $8, %%mm0             \n\t"
    "psrlw                $8, %%mm1             \n\t"
    "packuswb          %%mm1, %%mm0             \n\t"
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
    "add                  $8, %%"REG_a"         \n\t"
    " js                  1b                    \n\t"
    : : "g" ((x86_reg)-width), "r" (src+width*2), "r" (dst+width)
    /* "memory": the asm stores through dst, which is not an output operand */
    : "%"REG_a, "memory"
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1732

    
1733
/**
 * Extract the chroma samples from a packed UYVY line.
 *
 * For output index i, U is input byte 4*i and V is input byte 4*i + 2.
 *
 * @param dstU   output U line, width bytes
 * @param dstV   output V line, width bytes
 * @param src1   packed input line, 4*width bytes
 * @param src2   must be equal to src1 (asserted); not otherwise used
 * @param width  number of chroma samples to produce
 * @param unused not referenced
 *
 * The MMX path handles 4 output samples per iteration and always runs the
 * loop body at least once.
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* Pointers are biased to the end of their lines; REG_a counts up from
     * -width and the loop ends when it stops being negative. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    /* keep the even bytes (interleaved U and V), dropping luma */
    "pand                %%mm4, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    /* split the interleaved bytes: mm0 -> V (high bytes), mm1 -> U (low) */
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" ((x86_reg)-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    /* "memory": the asm stores through dstU/dstV, which are not outputs */
    : "%"REG_a, "memory"
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
1767

    
1768
/**
 * Reduce two lines of 2-byte samples to 8 bits by keeping the first byte of
 * each sample (the most significant byte when samples are big-endian).
 *
 * dstU[i] = src1[2*i] and dstV[i] = src2[2*i].
 *
 * @param dstU   output U line, width bytes
 * @param dstV   output V line, width bytes
 * @param src1   input U line, 2*width bytes
 * @param src2   input V line, 2*width bytes
 * @param width  number of samples to produce per plane
 * @param unused not referenced
 *
 * The MMX path handles 8 samples per iteration and always runs the loop body
 * at least once.
 */
static inline void RENAME(BEToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
{
#if COMPILE_TEMPLATE_MMX
    /* Pointers are biased to the end of their lines; REG_a counts up from
     * -width and the loop ends when it stops being negative. */
    __asm__ volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
    "movq    (%2, %%"REG_a",2), %%mm2           \n\t"
    "movq   8(%2, %%"REG_a",2), %%mm3           \n\t"
    /* mask with bm01010101, then pack the 16-bit words down to bytes */
    "pand                %%mm4, %%mm0           \n\t"
    "pand                %%mm4, %%mm1           \n\t"
    "pand                %%mm4, %%mm2           \n\t"
    "pand                %%mm4, %%mm3           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t"
    "packuswb            %%mm3, %%mm2           \n\t"
    "movq                %%mm0, (%3, %%"REG_a") \n\t"
    "movq                %%mm2, (%4, %%"REG_a") \n\t"
    "add                    $8, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" ((x86_reg)-width), "r" (src1+width*2), "r" (src2+width*2), "r" (dstU+width), "r" (dstV+width)
    /* "memory": the asm stores through dstU/dstV, which are not outputs */
    : "%"REG_a, "memory"
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[2*i];
        dstV[i]= src2[2*i];
    }
#endif
}
1801

    
1802
#if COMPILE_TEMPLATE_MMX
1803
/**
 * MMX conversion of a packed 24-bit RGB/BGR line to 8-bit luma.
 *
 * The coefficient pair loaded into mm5/mm6 depends on srcFormat: the
 * ff_bgr24toY*Coeff constants for PIX_FMT_BGR24, the ff_rgb24toY*Coeff
 * constants for anything else. The main loop consumes 12 input bytes
 * (4 pixels) and stores 4 output bytes per iteration, and always runs at
 * least once.
 *
 * @param dst       output luma line, width bytes
 * @param src       packed input line, 3 bytes per pixel
 * @param width     number of pixels to convert
 * @param srcFormat PIX_FMT_BGR24 selects the BGR coefficients
 *
 * NOTE(review): the coefficients are loaded in a separate asm statement and
 * the main loop relies on mm5/mm6 still holding them; neither statement
 * declares the MMX registers it uses.
 */
static inline void RENAME(bgr24ToY_mmx)(uint8_t *dst, const uint8_t *src, long width, int srcFormat)
{

    if(srcFormat == PIX_FMT_BGR24){
        __asm__ volatile(
            "movq  "MANGLE(ff_bgr24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_bgr24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }else{
        __asm__ volatile(
            "movq  "MANGLE(ff_rgb24toY1Coeff)", %%mm5       \n\t"
            "movq  "MANGLE(ff_rgb24toY2Coeff)", %%mm6       \n\t"
            :
        );
    }

    /* dst is biased to the end of the line and REG_a counts up from -width;
     * src (%0) is advanced directly, 12 bytes per iteration. */
    __asm__ volatile(
        "movq  "MANGLE(ff_bgr24toYOffset)", %%mm4   \n\t"
        "mov                        %2, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        /* four overlapping 4-byte loads covering pixels 0..3 */
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "movd                    6(%0), %%mm2       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        /* widen bytes to words (mm7 is zero) */
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm2       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        /* multiply-accumulate with the two coefficient vectors */
        "pmaddwd                 %%mm5, %%mm0       \n\t"
        "pmaddwd                 %%mm6, %%mm1       \n\t"
        "pmaddwd                 %%mm5, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        /* add the offset constant, scale down by 15 bits, pack to bytes */
        "paddd                   %%mm4, %%mm0       \n\t"
        "paddd                   %%mm4, %%mm2       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "packssdw                %%mm2, %%mm0       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dst+width), "g" ((x86_reg)-width)
    /* "memory": the asm stores through dst, which is not an output operand */
    : "%"REG_a, "memory"
    );
}
1855

    
1856
/**
 * MMX conversion of a packed 24-bit RGB/BGR line to 8-bit U and V.
 *
 * The coefficients come from one row of ff_bgr24toUV, selected by
 * (srcFormat == PIX_FMT_RGB24); the asm addresses that row at byte offsets
 * 0, 8, 16 and 24. The loop consumes 12 input bytes (4 pixels) and stores
 * 4 bytes to each output per iteration, and always runs at least once.
 *
 * @param dstU      output U line, width bytes
 * @param dstV      output V line, width bytes
 * @param src       packed input line, 3 bytes per pixel
 * @param width     number of pixels to convert
 * @param srcFormat PIX_FMT_RGB24 selects row 1 of the table, anything else row 0
 */
static inline void RENAME(bgr24ToUV_mmx)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src, long width, int srcFormat)
{
    /* dstU/dstV are biased to the end of their lines and REG_a counts up from
     * -width; src (%0) is advanced directly, 12 bytes per iteration. */
    __asm__ volatile(
        "movq                    24+%4, %%mm6       \n\t"
        "mov                        %3, %%"REG_a"   \n\t"
        "pxor                    %%mm7, %%mm7       \n\t"
        "1:                                         \n\t"
        PREFETCH"               64(%0)              \n\t"
        /* first pixel pair: mm0 accumulates for dstU, mm2 for dstV */
        "movd                     (%0), %%mm0       \n\t"
        "movd                    2(%0), %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm0       \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "movq                    %%mm0, %%mm2       \n\t"
        "movq                    %%mm1, %%mm3       \n\t"
        "pmaddwd                    %4, %%mm0       \n\t"
        "pmaddwd                  8+%4, %%mm1       \n\t"
        "pmaddwd                 16+%4, %%mm2       \n\t"
        "pmaddwd                 %%mm6, %%mm3       \n\t"
        "paddd                   %%mm1, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"

        /* second pixel pair: mm1 accumulates for dstU, mm4 for dstV */
        "movd                    6(%0), %%mm1       \n\t"
        "movd                    8(%0), %%mm3       \n\t"
        "add                       $12, %0          \n\t"
        "punpcklbw               %%mm7, %%mm1       \n\t"
        "punpcklbw               %%mm7, %%mm3       \n\t"
        "movq                    %%mm1, %%mm4       \n\t"
        "movq                    %%mm3, %%mm5       \n\t"
        "pmaddwd                    %4, %%mm1       \n\t"
        "pmaddwd                  8+%4, %%mm3       \n\t"
        "pmaddwd                 16+%4, %%mm4       \n\t"
        "pmaddwd                 %%mm6, %%mm5       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm5, %%mm4       \n\t"

        /* add the offset constant, scale down by 15 bits, pack to bytes */
        "movq "MANGLE(ff_bgr24toUVOffset)", %%mm3       \n\t"
        "paddd                   %%mm3, %%mm0       \n\t"
        "paddd                   %%mm3, %%mm2       \n\t"
        "paddd                   %%mm3, %%mm1       \n\t"
        "paddd                   %%mm3, %%mm4       \n\t"
        "psrad                     $15, %%mm0       \n\t"
        "psrad                     $15, %%mm2       \n\t"
        "psrad                     $15, %%mm1       \n\t"
        "psrad                     $15, %%mm4       \n\t"
        "packssdw                %%mm1, %%mm0       \n\t"
        "packssdw                %%mm4, %%mm2       \n\t"
        "packuswb                %%mm0, %%mm0       \n\t"
        "packuswb                %%mm2, %%mm2       \n\t"
        "movd                %%mm0, (%1, %%"REG_a") \n\t"
        "movd                %%mm2, (%2, %%"REG_a") \n\t"
        "add                        $4, %%"REG_a"   \n\t"
        " js                        1b              \n\t"
    : "+r" (src)
    : "r" (dstU+width), "r" (dstV+width), "g" ((x86_reg)-width), "m"(ff_bgr24toUV[srcFormat == PIX_FMT_RGB24][0])
    /* "memory": the asm stores through dstU/dstV, which are not outputs */
    : "%"REG_a, "memory"
    );
}
1913
#endif
1914

    
1915
static inline void RENAME(bgr24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1916
{
1917
#if COMPILE_TEMPLATE_MMX
1918
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_BGR24);
1919
#else
1920
    int i;
1921
    for (i=0; i<width; i++)
1922
    {
1923
        int b= src[i*3+0];
1924
        int g= src[i*3+1];
1925
        int r= src[i*3+2];
1926

    
1927
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1928
    }
1929
#endif /* COMPILE_TEMPLATE_MMX */
1930
}
1931

    
1932
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1933
{
1934
#if COMPILE_TEMPLATE_MMX
1935
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_BGR24);
1936
#else
1937
    int i;
1938
    for (i=0; i<width; i++)
1939
    {
1940
        int b= src1[3*i + 0];
1941
        int g= src1[3*i + 1];
1942
        int r= src1[3*i + 2];
1943

    
1944
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1945
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1946
    }
1947
#endif /* COMPILE_TEMPLATE_MMX */
1948
    assert(src1 == src2);
1949
}
1950

    
1951
static inline void RENAME(bgr24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1952
{
1953
    int i;
1954
    for (i=0; i<width; i++)
1955
    {
1956
        int b= src1[6*i + 0] + src1[6*i + 3];
1957
        int g= src1[6*i + 1] + src1[6*i + 4];
1958
        int r= src1[6*i + 2] + src1[6*i + 5];
1959

    
1960
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1961
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1962
    }
1963
    assert(src1 == src2);
1964
}
1965

    
1966
static inline void RENAME(rgb24ToY)(uint8_t *dst, const uint8_t *src, long width, uint32_t *unused)
1967
{
1968
#if COMPILE_TEMPLATE_MMX
1969
    RENAME(bgr24ToY_mmx)(dst, src, width, PIX_FMT_RGB24);
1970
#else
1971
    int i;
1972
    for (i=0; i<width; i++)
1973
    {
1974
        int r= src[i*3+0];
1975
        int g= src[i*3+1];
1976
        int b= src[i*3+2];
1977

    
1978
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1979
    }
1980
#endif
1981
}
1982

    
1983
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
1984
{
1985
#if COMPILE_TEMPLATE_MMX
1986
    assert(src1==src2);
1987
    RENAME(bgr24ToUV_mmx)(dstU, dstV, src1, width, PIX_FMT_RGB24);
1988
#else
1989
    int i;
1990
    assert(src1==src2);
1991
    for (i=0; i<width; i++)
1992
    {
1993
        int r= src1[3*i + 0];
1994
        int g= src1[3*i + 1];
1995
        int b= src1[3*i + 2];
1996

    
1997
        dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1998
        dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1999
    }
2000
#endif
2001
}
2002

    
2003
static inline void RENAME(rgb24ToUV_half)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1, const uint8_t *src2, long width, uint32_t *unused)
2004
{
2005
    int i;
2006
    assert(src1==src2);
2007
    for (i=0; i<width; i++)
2008
    {
2009
        int r= src1[6*i + 0] + src1[6*i + 3];
2010
        int g= src1[6*i + 1] + src1[6*i + 4];
2011
        int b= src1[6*i + 2] + src1[6*i + 5];
2012

    
2013
        dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2014
        dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
2015
    }
2016
}
2017

    
2018

    
2019
// bilinear / bicubic scaling
2020
static inline void RENAME(hScale)(int16_t *dst, int dstW, const uint8_t *src, int srcW, int xInc,
2021
                                  const int16_t *filter, const int16_t *filterPos, long filterSize)
2022
{
2023
#if COMPILE_TEMPLATE_MMX
2024
    assert(filterSize % 4 == 0 && filterSize>0);
2025
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
2026
    {
2027
        x86_reg counter= -2*dstW;
2028
        filter-= counter*2;
2029
        filterPos-= counter/2;
2030
        dst-= counter/2;
2031
        __asm__ volatile(
2032
#if defined(PIC)
2033
        "push            %%"REG_b"              \n\t"
2034
#endif
2035
        "pxor                %%mm7, %%mm7       \n\t"
2036
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
2037
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
2038
        ASMALIGN(4)
2039
        "1:                                     \n\t"
2040
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
2041
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
2042
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
2043
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
2044
        "movd      (%3, %%"REG_a"), %%mm0       \n\t"
2045
        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
2046
        "punpcklbw           %%mm7, %%mm0       \n\t"
2047
        "punpcklbw           %%mm7, %%mm2       \n\t"
2048
        "pmaddwd             %%mm1, %%mm0       \n\t"
2049
        "pmaddwd             %%mm2, %%mm3       \n\t"
2050
        "movq                %%mm0, %%mm4       \n\t"
2051
        "punpckldq           %%mm3, %%mm0       \n\t"
2052
        "punpckhdq           %%mm3, %%mm4       \n\t"
2053
        "paddd               %%mm4, %%mm0       \n\t"
2054
        "psrad                  $7, %%mm0       \n\t"
2055
        "packssdw            %%mm0, %%mm0       \n\t"
2056
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
2057
        "add                    $4, %%"REG_BP"  \n\t"
2058
        " jnc                   1b              \n\t"
2059

    
2060
        "pop            %%"REG_BP"              \n\t"
2061
#if defined(PIC)
2062
        "pop             %%"REG_b"              \n\t"
2063
#endif
2064
        : "+a" (counter)
2065
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2066
#if !defined(PIC)
2067
        : "%"REG_b
2068
#endif
2069
        );
2070
    }
2071
    else if (filterSize==8)
2072
    {
2073
        x86_reg counter= -2*dstW;
2074
        filter-= counter*4;
2075
        filterPos-= counter/2;
2076
        dst-= counter/2;
2077
        __asm__ volatile(
2078
#if defined(PIC)
2079
        "push             %%"REG_b"             \n\t"
2080
#endif
2081
        "pxor                 %%mm7, %%mm7      \n\t"
2082
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
2083
        "mov              %%"REG_a", %%"REG_BP" \n\t"
2084
        ASMALIGN(4)
2085
        "1:                                     \n\t"
2086
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
2087
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
2088
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
2089
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
2090
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
2091
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
2092
        "punpcklbw            %%mm7, %%mm0      \n\t"
2093
        "punpcklbw            %%mm7, %%mm2      \n\t"
2094
        "pmaddwd              %%mm1, %%mm0      \n\t"
2095
        "pmaddwd              %%mm2, %%mm3      \n\t"
2096

    
2097
        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
2098
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
2099
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
2100
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
2101
        "punpcklbw            %%mm7, %%mm4      \n\t"
2102
        "punpcklbw            %%mm7, %%mm2      \n\t"
2103
        "pmaddwd              %%mm1, %%mm4      \n\t"
2104
        "pmaddwd              %%mm2, %%mm5      \n\t"
2105
        "paddd                %%mm4, %%mm0      \n\t"
2106
        "paddd                %%mm5, %%mm3      \n\t"
2107
        "movq                 %%mm0, %%mm4      \n\t"
2108
        "punpckldq            %%mm3, %%mm0      \n\t"
2109
        "punpckhdq            %%mm3, %%mm4      \n\t"
2110
        "paddd                %%mm4, %%mm0      \n\t"
2111
        "psrad                   $7, %%mm0      \n\t"
2112
        "packssdw             %%mm0, %%mm0      \n\t"
2113
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
2114
        "add                     $4, %%"REG_BP" \n\t"
2115
        " jnc                    1b             \n\t"
2116

    
2117
        "pop             %%"REG_BP"             \n\t"
2118
#if defined(PIC)
2119
        "pop              %%"REG_b"             \n\t"
2120
#endif
2121
        : "+a" (counter)
2122
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
2123
#if !defined(PIC)
2124
        : "%"REG_b
2125
#endif
2126
        );
2127
    }
2128
    else
2129
    {
2130
        uint8_t *offset = src+filterSize;
2131
        x86_reg counter= -2*dstW;
2132
        //filter-= counter*filterSize/2;
2133
        filterPos-= counter/2;
2134
        dst-= counter/2;
2135
        __asm__ volatile(
2136
        "pxor                  %%mm7, %%mm7     \n\t"
2137
        ASMALIGN(4)
2138
        "1:                                     \n\t"
2139
        "mov                      %2, %%"REG_c" \n\t"
2140
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
2141
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
2142
        "mov                      %5, %%"REG_c" \n\t"
2143
        "pxor                  %%mm4, %%mm4     \n\t"
2144
        "pxor                  %%mm5, %%mm5     \n\t"
2145
        "2:                                     \n\t"
2146
        "movq                   (%1), %%mm1     \n\t"
2147
        "movq               (%1, %6), %%mm3     \n\t"
2148
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
2149
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
2150
        "punpcklbw             %%mm7, %%mm0     \n\t"
2151
        "punpcklbw             %%mm7, %%mm2     \n\t"
2152
        "pmaddwd               %%mm1, %%mm0     \n\t"
2153
        "pmaddwd               %%mm2, %%mm3     \n\t"
2154
        "paddd                 %%mm3, %%mm5     \n\t"
2155
        "paddd                 %%mm0, %%mm4     \n\t"
2156
        "add                      $8, %1        \n\t"
2157
        "add                      $4, %%"REG_c" \n\t"
2158
        "cmp                      %4, %%"REG_c" \n\t"
2159
        " jb                      2b            \n\t"
2160
        "add                      %6, %1        \n\t"
2161
        "movq                  %%mm4, %%mm0     \n\t"
2162
        "punpckldq             %%mm5, %%mm4     \n\t"
2163
        "punpckhdq             %%mm5, %%mm0     \n\t"
2164
        "paddd                 %%mm0, %%mm4     \n\t"
2165
        "psrad                    $7, %%mm4     \n\t"
2166
        "packssdw              %%mm4, %%mm4     \n\t"
2167
        "mov                      %3, %%"REG_a" \n\t"
2168
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
2169
        "add                      $4, %0        \n\t"
2170
        " jnc                     1b            \n\t"
2171

    
2172
        : "+r" (counter), "+r" (filter)
2173
        : "m" (filterPos), "m" (dst), "m"(offset),
2174
          "m" (src), "r" ((x86_reg)filterSize*2)
2175
        : "%"REG_a, "%"REG_c, "%"REG_d
2176
        );
2177
    }
2178
#else
2179
#if COMPILE_TEMPLATE_ALTIVEC
2180
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
2181
#else
2182
    int i;
2183
    for (i=0; i<dstW; i++)
2184
    {
2185
        int j;
2186
        int srcPos= filterPos[i];
2187
        int val=0;
2188
        //printf("filterPos: %d\n", filterPos[i]);
2189
        for (j=0; j<filterSize; j++)
2190
        {
2191
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
2192
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2193
        }
2194
        //filter += hFilterSize;
2195
        dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2196
        //dst[i] = val>>7;
2197
    }
2198
#endif /* COMPILE_ALTIVEC */
2199
#endif /* COMPILE_MMX */
2200
}
2201

    
2202
/*
 * Shared interpolation step of the plain x86 fast-bilinear loops.
 *
 * On entry: %edi = src[xx], %esi = src[xx+1], %ecx = xalpha.
 * On exit:  %esi = ((src[xx] << 16) + (src[xx+1] - src[xx]) * xalpha) >> 9,
 *           REG_D has been loaded from asm operand %1, %edi is destroyed.
 *
 * The macro expands to string literals only and must be used inside an
 * asm statement whose operand %1 is what REG_D should point to.
 * The last line deliberately carries no line continuation, so the macro
 * does not absorb whatever line follows it.
 */
#define FAST_BILINEAR_X86 \
    "subl    %%edi, %%esi    \n\t" /*  src[xx+1] - src[xx] */                   \
    "imull   %%ecx, %%esi    \n\t" /* (src[xx+1] - src[xx])*xalpha */           \
    "shll      $16, %%edi    \n\t"                                              \
    "addl    %%edi, %%esi    \n\t" /* src[xx+1]*xalpha + src[xx]*(1-xalpha) */  \
    "mov        %1, %%"REG_D"\n\t"                                              \
    "shrl       $9, %%esi    \n\t"
2209

    
2210
/**
 * Portable fast-bilinear horizontal scaler for one luma line.
 *
 * The source position is tracked in 16.16 fixed point and advanced by xInc
 * per output sample. Each output is src[xx] scaled by 128 plus the
 * difference to the right-hand neighbour weighted by the top 7 bits of the
 * fractional position. src[xx+1] is read for every output sample.
 *
 * @param c        not read by this function
 * @param dst      output line of dstWidth 16-bit samples
 * @param dstWidth number of output samples
 * @param src      input line of 8-bit samples
 * @param srcW     not read by this function
 * @param xInc     source step per output sample, 16.16 fixed point
 */
static inline void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
                                        int dstWidth, const uint8_t *src, int srcW,
                                        int xInc)
{
    unsigned int pos = 0;
    int x;

    for (x = 0; x < dstWidth; x++, pos += xInc)
    {
        const unsigned int whole  = pos >> 16;
        const unsigned int weight = (pos & 0xFFFF) >> 9;
        const uint8_t *p = src + whole;

        dst[x] = (p[0]<<7) + (p[1] - p[0])*weight;
    }
}
2224

    
2225
      // *** horizontal scale Y line to temp buffer
2226
/**
 * Horizontally scale one luma or alpha line into a 16-bit line buffer.
 *
 * Steps, in order:
 *  1. adjust src by a per-format byte offset (RGB32/BGR32 alpha, the *_1
 *     32-bit variants, RGB48LE);
 *  2. if the context provides an input converter (c->hascale_internal when
 *     isAlpha is set, otherwise c->hyscale_internal), run it into
 *     formatConvBuffer and scale from that buffer instead;
 *  3. scale with c->hScale, or, when SWS_FAST_BILINEAR is requested, with
 *     the MMX2 filter code, the plain x86 asm loop or c->hyscale_fast,
 *     depending on the build configuration;
 *  4. for non-alpha lines, when source and destination range flags differ
 *     and the destination is neither RGB nor BGR, rescale the samples in place.
 *
 * @param c                scaler context
 * @param dst              output line (dstWidth samples)
 * @param dstWidth         number of output samples
 * @param src              input line
 * @param srcW             number of input samples
 * @param xInc             source step per output sample, 16.16 fixed point
 * @param flags            SWS_* flags; only SWS_FAST_BILINEAR is tested here
 * @param hLumFilter       coefficients handed to c->hScale
 * @param hLumFilterPos    source positions handed to c->hScale
 * @param hLumFilterSize   tap count handed to c->hScale
 * @param srcFormat        pixel format of src
 * @param formatConvBuffer scratch line used by the input converter
 * @param pal              palette handed to the input converter
 * @param isAlpha          nonzero when the line is an alpha line
 */
static inline void RENAME(hyscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src, int srcW, int xInc,
                                   int flags, const int16_t *hLumFilter,
                                   const int16_t *hLumFilterPos, int hLumFilterSize,
                                   int srcFormat, uint8_t *formatConvBuffer,
                                   uint32_t *pal, int isAlpha)
{
    int32_t av_unused *mmx2FilterPos = c->lumMmx2FilterPos;
    int16_t av_unused *mmx2Filter    = c->lumMmx2Filter;
    int     av_unused canMMX2BeUsed  = c->canMMX2BeUsed;
    void    av_unused *mmx2FilterCode= c->lumMmx2FilterCode;
    void (*convertLine)(uint8_t *, const uint8_t *, long, uint32_t *);

    if (isAlpha)
        convertLine = c->hascale_internal;
    else
        convertLine = c->hyscale_internal;

    /* per-format byte offset into the packed pixels */
    if (isAlpha && (srcFormat == PIX_FMT_RGB32 || srcFormat == PIX_FMT_BGR32))
        src += 3;
    else if (!isAlpha && (srcFormat == PIX_FMT_RGB32_1 || srcFormat == PIX_FMT_BGR32_1))
        src += ALT32_CORR;

    if (srcFormat == PIX_FMT_RGB48LE)
        src++;

    if (convertLine) {
        convertLine(formatConvBuffer, src, srcW, pal);
        src = formatConvBuffer;
    }

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        c->hScale(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
        int x;
#if defined(PIC)
        DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
        if (canMMX2BeUsed)
        {
            /* The code at mmx2FilterCode is called eight times; between calls
               the source and destination registers are advanced. Under PIC
               the REG_b register is saved to and restored from ebxsave. */
            __asm__ volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#if ARCH_X86_64

#define CALL_MMX2_FILTER_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"

#else

#define CALL_MMX2_FILTER_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"

#endif /* ARCH_X86_64 */

CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (mmx2FilterCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            /* outputs whose source position is at or beyond the last source
               sample are overwritten with that sample scaled by 128 */
            x = dstWidth-1;
            while ((x*xInc)>>16 >= srcW-1)
            {
                dst[x] = src[srcW-1]*128;
                x--;
            }
        }
        else
        {
#endif /* COMPILE_TEMPLATE_MMX2 */
        x86_reg xIncWhole = xInc >> 16;
        uint16_t xIncFrac = xInc & 0xffff;
        //NO MMX just normal asm ...
        /* two output samples per loop iteration */
        __asm__ volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        FAST_BILINEAR_X86
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //xalpha += xInc&0xFFFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>16 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xIncWhole), "m" (xIncFrac)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#if COMPILE_TEMPLATE_MMX2
        } //if MMX2 can't be used
#endif
#else
        c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
#endif /* ARCH_X86 */
    }

    if (!isAlpha && c->srcRange != c->dstRange && !isRGB(c->dstFormat) && !isBGR(c->dstFormat)) {
        int x;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if (c->srcRange) {
            for (x = 0; x < dstWidth; x++)
                dst[x] = (dst[x]*14071 + 33561947) >> 14;
        } else {
            for (x = 0; x < dstWidth; x++) {
                /* clip the input first, then apply the linear map */
                const int clipped = FFMIN(dst[x],30189);
                dst[x] = (clipped*19077 - 39057361) >> 14;
            }
        }
    }
}
2388

    
2389
/**
 * Portable fast-bilinear horizontal scaler for one pair of chroma lines.
 *
 * The source position is tracked in 16.16 fixed point and advanced by xInc
 * per output sample. Each output blends src[xx] and src[xx+1] with the
 * 7-bit weights (xalpha^127) and xalpha. Results for src1 go to dst[i],
 * results for src2 to dst[i+VOFW].
 *
 * @param c        not read by this function
 * @param dst      output buffer holding both chroma lines, VOFW samples apart
 * @param dstWidth number of output samples per line
 * @param src1     first input chroma line
 * @param src2     second input chroma line
 * @param srcW     not read by this function
 * @param xInc     source step per output sample, 16.16 fixed point
 */
static inline void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst,
                                        int dstWidth, const uint8_t *src1,
                                        const uint8_t *src2, int srcW, int xInc)
{
    unsigned int pos = 0;
    int x;

    for (x = 0; x < dstWidth; x++, pos += xInc)
    {
        const unsigned int whole  = pos >> 16;
        const unsigned int weight = (pos & 0xFFFF) >> 9;
        const unsigned int inverse = weight ^ 127;

        dst[x]      = src1[whole]*inverse + src1[whole+1]*weight;
        dst[x+VOFW] = src2[whole]*inverse + src2[whole+1]*weight;
    }
}
2408

    
2409
/**
 * Horizontally scale one pair of chroma lines into a 16-bit line buffer.
 * The first line is written at dst, the second at dst+VOFW.
 *
 * Steps, in order:
 *  1. return immediately for gray and monochrome source formats;
 *  2. adjust src1/src2 by a per-format byte offset (the *_1 32-bit
 *     variants, RGB48LE);
 *  3. if c->hcscale_internal is set, run it into formatConvBuffer and
 *     formatConvBuffer+VOFW and scale from there instead;
 *  4. scale with c->hScale, or, when SWS_FAST_BILINEAR is requested, with
 *     the MMX2 filter code, the plain x86 asm loop or c->hcscale_fast,
 *     depending on the build configuration;
 *  5. when source and destination range flags differ and the destination
 *     is neither RGB nor BGR, rescale both lines in place.
 *
 * @param c                scaler context
 * @param dst              output buffer for both lines
 * @param dstWidth         number of output samples per line
 * @param src1             first input chroma line
 * @param src2             second input chroma line
 * @param srcW             number of input samples per line
 * @param xInc             source step per output sample, 16.16 fixed point
 * @param flags            SWS_* flags; only SWS_FAST_BILINEAR is tested here
 * @param hChrFilter       coefficients handed to c->hScale
 * @param hChrFilterPos    source positions handed to c->hScale
 * @param hChrFilterSize   tap count handed to c->hScale
 * @param srcFormat        pixel format of the source
 * @param formatConvBuffer scratch buffer used by the input converter
 * @param pal              palette handed to the input converter
 */
inline static void RENAME(hcscale)(SwsContext *c, uint16_t *dst, long dstWidth, const uint8_t *src1, const uint8_t *src2,
                                   int srcW, int xInc, int flags, const int16_t *hChrFilter,
                                   const int16_t *hChrFilterPos, int hChrFilterSize,
                                   int srcFormat, uint8_t *formatConvBuffer,
                                   uint32_t *pal)
{
    int32_t av_unused *mmx2FilterPos = c->chrMmx2FilterPos;
    int16_t av_unused *mmx2Filter    = c->chrMmx2Filter;
    int     av_unused canMMX2BeUsed  = c->canMMX2BeUsed;
    void    av_unused *mmx2FilterCode= c->chrMmx2FilterCode;

    /* nothing to scale for these formats */
    if (isGray(srcFormat) || srcFormat==PIX_FMT_MONOBLACK || srcFormat==PIX_FMT_MONOWHITE)
        return;

    /* per-format byte offset into the packed pixels */
    if (srcFormat==PIX_FMT_RGB32_1 || srcFormat==PIX_FMT_BGR32_1) {
        src1 += ALT32_CORR;
        src2 += ALT32_CORR;
    }

    if (srcFormat==PIX_FMT_RGB48LE) {
        src1++;
        src2++;
    }

    if (c->hcscale_internal) {
        uint8_t *const convU = formatConvBuffer;
        uint8_t *const convV = formatConvBuffer+VOFW;

        c->hcscale_internal(convU, convV, src1, src2, srcW, pal);
        src1 = convU;
        src2 = convV;
    }

#if COMPILE_TEMPLATE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        c->hScale(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
        c->hScale(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if ARCH_X86 && CONFIG_GPL
#if COMPILE_TEMPLATE_MMX2
        int x;
#if defined(PIC)
        DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
        if (canMMX2BeUsed)
        {
            /* The code at mmx2FilterCode is called four times for src1, then
               the registers are reloaded for src2 with the destination moved
               VOF bytes on, and it is called four more times. Under PIC the
               REG_b register is saved to and restored from ebxsave.
               CALL_MMX2_FILTER_CODE is defined inside hyscale. */
            __asm__ volatile(
#if defined(PIC)
            "mov          %%"REG_b", %6         \n\t"
#endif
            "pxor             %%mm7, %%mm7      \n\t"
            "mov                 %0, %%"REG_c"  \n\t"
            "mov                 %1, %%"REG_D"  \n\t"
            "mov                 %2, %%"REG_d"  \n\t"
            "mov                 %3, %%"REG_b"  \n\t"
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
            "mov                 %5, %%"REG_c"  \n\t" // src
            "mov                 %1, %%"REG_D"  \n\t" // buf1
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
            PREFETCH"   (%%"REG_c")             \n\t"
            PREFETCH" 32(%%"REG_c")             \n\t"
            PREFETCH" 64(%%"REG_c")             \n\t"

CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE
CALL_MMX2_FILTER_CODE

#if defined(PIC)
            "mov %6, %%"REG_b"    \n\t"
#endif
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (mmx2FilterCode), "m" (src2)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
             ,"%"REG_b
#endif
            );
            /* outputs whose source position is at or beyond the last source
               sample are overwritten with that sample scaled by 128 */
            x = dstWidth-1;
            while ((x*xInc)>>16 >= srcW-1)
            {
                dst[x] = src1[srcW-1]*128;
                dst[x+VOFW] = src2[srcW-1]*128;
                x--;
            }
        }
        else
        {
#endif /* COMPILE_TEMPLATE_MMX2 */
            x86_reg xIncWhole = (x86_reg) (xInc >> 16);
            uint16_t xIncFrac = xInc & 0xffff;
            /* one output sample of each line per loop iteration */
            __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
            "xorl    %%ecx, %%ecx                   \n\t" // xalpha
            ASMALIGN(4)
            "1:                                     \n\t"
            "mov        %0, %%"REG_S"               \n\t"
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"

            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
            FAST_BILINEAR_X86
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"

            "addw       %4, %%cx                    \n\t" //xalpha += xInc&0xFFFF
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>16 + carry
            "add        $1, %%"REG_a"               \n\t"
            "cmp        %2, %%"REG_a"               \n\t"
            " jb        1b                          \n\t"

/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
   which is needed to support GCC 4.0. */
#if ARCH_X86_64 && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
            :: "m" (src1), "m" (dst), "g" (dstWidth), "m" (xIncWhole), "m" (xIncFrac),
#else
            :: "m" (src1), "m" (dst), "m" (dstWidth), "m" (xIncWhole), "m" (xIncFrac),
#endif
            "r" (src2)
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
            );
#if COMPILE_TEMPLATE_MMX2
        } //if MMX2 can't be used
#endif
#else
        c->hcscale_fast(c, dst, dstWidth, src1, src2, srcW, xInc);
#endif /* ARCH_X86 */
    }
    if (c->srcRange != c->dstRange && !isRGB(c->dstFormat) && !isBGR(c->dstFormat)) {
        int x;
        //FIXME all pal and rgb srcFormats could do this conversion as well
        //FIXME all scalers more complex than bilinear could do half of this transform
        if (c->srcRange) {
            for (x = 0; x < dstWidth; x++) {
                dst[x     ] = (dst[x     ]*1799 + 4081085) >> 11; //1469
                dst[x+VOFW] = (dst[x+VOFW]*1799 + 4081085) >> 11; //1469
            }
        } else {
            for (x = 0; x < dstWidth; x++) {
                /* clip the input first, then apply the linear map */
                dst[x     ] = (FFMIN(dst[x     ],30775)*4663 - 9289992) >> 12; //-264
                dst[x+VOFW] = (FFMIN(dst[x+VOFW],30775)*4663 - 9289992) >> 12; //-264
            }
        }
    }
}
2572

    
2573
/**
 * Scale one slice of the source picture and write every destination line
 * that can be completed with the source lines seen so far.
 *
 * Source lines are first scaled horizontally into the luma/chroma(/alpha)
 * line buffers of the context (lumPixBuf, chrPixBuf, alpPixBuf); the vertical
 * scaler / output converter then consumes these buffers to produce one
 * destination line per iteration of the main loop.
 *
 * Side effects visible to the caller:
 *  - for packed source formats, src[1..3] and srcStride[1..3] are overwritten
 *    with src[0] / srcStride[0];
 *  - srcStride[1] and srcStride[2] are shifted left by c->vChrDrop;
 *  - dstY, lumBufIndex, chrBufIndex, lastInLumBuf and lastInChrBuf are stored
 *    back into the context so that the next slice continues where this one
 *    stopped.
 *
 * @param c          scaler context
 * @param src        source plane pointers for this slice (modified, see above)
 * @param srcStride  source strides (modified, see above)
 * @param srcSliceY  position of the first line of the slice in the source picture
 * @param srcSliceH  number of lines in the slice
 * @param dst        destination plane pointers
 * @param dstStride  destination strides
 * @return number of destination lines written by this call
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    /* negate-shift-negate rounds the chroma slice height up instead of down */
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* packed formats keep everything in plane 0: alias all planes/strides to it */
    if (isPacked(c->srcFormat)){
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
    //       (int)dst[0], (int)dst[1], (int)dst[2]);

    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
    //dstStride[0],dstStride[1],dstStride[2]);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0)
    {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready)
        {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
        unsigned char *aDest=(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3]+dstStride[3]*dstY : NULL;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
        // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
        if (!enough_lines) {
            /* not enough input: still buffer everything this slice provides */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
        }

        /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
        firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
        lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
        vChrBufSize, vLumBufSize);*/

        //Do horizontal scaling
        while(lastInLumBuf < lastLumSrcY)
        {
            uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            //printf("%d %d\n", lumBufIndex, vLumBufSize);
            RENAME(hyscale)(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                            flags, hLumFilter, hLumFilterPos, hLumFilterSize,
                            c->srcFormat, formatConvBuffer,
                            pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                RENAME(hyscale)(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW, lumXInc,
                                flags, hLumFilter, hLumFilterPos, hLumFilterSize,
                                c->srcFormat, formatConvBuffer,
                                pal, 1);
            lastInLumBuf++;
        }
        while(lastInChrBuf < lastChrSrcY)
        {
            uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (!(isGray(srcFormat) || isGray(dstFormat)))
                RENAME(hcscale)(c, chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                flags, hChrFilter, hChrFilterPos, hChrFilterSize,
                                c->srcFormat, formatConvBuffer,
                                pal);
            lastInChrBuf++;
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if COMPILE_TEMPLATE_MMX
        c->blueDither= ff_dither8[dstY&1];
        if (c->dstFormat == PIX_FMT_RGB555 || c->dstFormat == PIX_FMT_BGR555)
            c->greenDither= ff_dither8[dstY&1];
        else
            c->greenDither= ff_dither4[dstY&1];
        c->redDither= ff_dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2)
        {
            /* pointers to the first buffered line needed for this output line */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **) chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#if COMPILE_TEMPLATE_MMX
            int i;
            /* fill the per-line filter tables (line pointers + coefficients)
               stored in the context for the MMX code */
            if (flags & SWS_ACCURATE_RND){
                int s= APCK_SIZE / 8;
                for (i=0; i<vLumFilterSize; i+=2){
                    *(void**)&lumMmxFilter[s*i              ]= lumSrcPtr[i  ];
                    *(void**)&lumMmxFilter[s*i+APCK_PTR2/4  ]= lumSrcPtr[i+(vLumFilterSize>1)];
                              lumMmxFilter[s*i+APCK_COEF/4  ]=
                              lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i    ]
                        + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
                        *(void**)&alpMmxFilter[s*i              ]= alpSrcPtr[i  ];
                        *(void**)&alpMmxFilter[s*i+APCK_PTR2/4  ]= alpSrcPtr[i+(vLumFilterSize>1)];
                                  alpMmxFilter[s*i+APCK_COEF/4  ]=
                                  alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4  ];
                    }
                }
                for (i=0; i<vChrFilterSize; i+=2){
                    *(void**)&chrMmxFilter[s*i              ]= chrSrcPtr[i  ];
                    *(void**)&chrMmxFilter[s*i+APCK_PTR2/4  ]= chrSrcPtr[i+(vChrFilterSize>1)];
                              chrMmxFilter[s*i+APCK_COEF/4  ]=
                              chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                        + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
                }
            }else{
                for (i=0; i<vLumFilterSize; i++)
                {
                    /* line pointer split into low / high 32 bits, then the
                       16-bit coefficient replicated into both halves of a word */
                    lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                    lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                    lumMmxFilter[4*i+2]=
                    lumMmxFilter[4*i+3]=
                        ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
                    if (CONFIG_SWSCALE_ALPHA && alpPixBuf){
                        alpMmxFilter[4*i+0]= (int32_t)alpSrcPtr[i];
                        alpMmxFilter[4*i+1]= (uint64_t)alpSrcPtr[i] >> 32;
                        alpMmxFilter[4*i+2]=
                        alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
                    }
                }
                for (i=0; i<vChrFilterSize; i++)
                {
                    chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                    chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                    chrMmxFilter[4*i+2]=
                    chrMmxFilter[4*i+3]=
                        ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
                }
            }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                c->yuv2nv12X(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
                {
                    /* Use the line the vertical filter actually points at
                       (the same slots as lumSrcPtr[0] / chrSrcPtr[0] /
                       alpSrcPtr[0]); slot 0 of the line buffers is only the
                       current line when the buffers hold a single line. */
                    int16_t *lumBuf = lumPixBuf[lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize];
                    int16_t *chrBuf= chrPixBuf[chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize];
                    int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpPixBuf[lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize] : NULL;
                    c->yuv2yuv1(c, lumBuf, chrBuf, alpBuf, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    c->yuv2yuvX(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
                }
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed1_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }else{
                        c->yuv2packed1(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                            alpPixBuf ? *alpSrcPtr : NULL,
                            dest, dstW, chrAlpha, dstFormat, flags, dstY);
                    }
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c, //FIXME write a packed2_full function
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }else{
                        c->yuv2packed2(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                            alpPixBuf ? *alpSrcPtr : NULL, alpPixBuf ? *(alpSrcPtr+1) : NULL,
                            dest, dstW, lumAlpha, chrAlpha, dstY);
                    }
                }
                else //general RGB
                {
                    if(flags & SWS_FULL_CHR_H_INT){
                        yuv2rgbXinC_full(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }else{
                        c->yuv2packedX(c,
                            vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                            vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                            alpSrcPtr, dest, dstW, dstY);
                    }
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            /* last two destination lines: always go through the plain C writers */
            const int16_t **lumSrcPtr= (const int16_t **)lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrSrcPtr= (const int16_t **)chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    alpSrcPtr, dest, uDest, vDest, aDest, dstW, chrDstW);
            }
            else
            {
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if(flags & SWS_FULL_CHR_H_INT){
                    yuv2rgbXinC_full(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, dstW, dstY);
                }else{
                    yuv2packedXinC(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        alpSrcPtr, dest, dstW, dstY);
                }
            }
        }
    }

    /* destination has an alpha plane but no alpha was buffered:
       fill the lines written by this call with 255 */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if COMPILE_TEMPLATE_MMX
    if (flags & SWS_CPU_CAPS_MMX2 )  __asm__ volatile("sfence":::"memory");
    /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
    if (flags & SWS_CPU_CAPS_3DNOW)  __asm__ volatile("femms" :::"memory");
    else                             __asm__ volatile("emms"  :::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
2943

    
2944
/**
 * Install the function pointers of this template instance into the context.
 *
 * The output writers and the generic/fast horizontal scalers are always set
 * to the RENAME()d variants of this template. The input converters
 * (hyscale_internal for luma, hcscale_internal for chroma, hascale_internal
 * for alpha) are chosen from the source pixel format and stay NULL for
 * formats that have no entry below.
 */
static void RENAME(sws_init_swScale)(SwsContext *c)
{
    const enum PixelFormat fmt = c->srcFormat;
    /* non-zero when the source chroma is horizontally subsampled */
    const int half = c->chrSrcHSubSample != 0;

    /* vertical scalers / output writers */
    c->yuv2nv12X    = RENAME(yuv2nv12X   );
    c->yuv2yuv1     = RENAME(yuv2yuv1    );
    c->yuv2yuvX     = RENAME(yuv2yuvX    );
    c->yuv2packed1  = RENAME(yuv2packed1 );
    c->yuv2packed2  = RENAME(yuv2packed2 );
    c->yuv2packedX  = RENAME(yuv2packedX );

    /* horizontal scalers */
    c->hScale       = RENAME(hScale      );
    c->hyscale_fast = RENAME(hyscale_fast);
    c->hcscale_fast = RENAME(hcscale_fast);

    /* chroma input converter: formats that do not depend on subsampling */
    c->hcscale_internal = NULL;
    switch (fmt) {
    case PIX_FMT_YUYV422:
        c->hcscale_internal = RENAME(yuy2ToUV);
        break;
    case PIX_FMT_UYVY422:
        c->hcscale_internal = RENAME(uyvyToUV);
        break;
    case PIX_FMT_RGB8:
    case PIX_FMT_BGR8:
    case PIX_FMT_PAL8:
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE:
        c->hcscale_internal = palToUV;
        break;
    case PIX_FMT_YUV420PBE:
    case PIX_FMT_YUV422PBE:
    case PIX_FMT_YUV444PBE:
        c->hcscale_internal = RENAME(BEToUV);
        break;
    case PIX_FMT_YUV420PLE:
    case PIX_FMT_YUV422PLE:
    case PIX_FMT_YUV444PLE:
        c->hcscale_internal = RENAME(LEToUV);
        break;
    default:
        break;
    }

    /* chroma input converter: RGB-like formats pick the "_half" variant
       when the source chroma is horizontally subsampled */
    switch (fmt) {
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE:
        c->hcscale_internal = half ? rgb48ToUV_half : rgb48ToUV;
        break;
    case PIX_FMT_RGB32:
    case PIX_FMT_RGB32_1:
        c->hcscale_internal = half ? bgr32ToUV_half : bgr32ToUV;
        break;
    case PIX_FMT_BGR24:
        c->hcscale_internal = half ? RENAME(bgr24ToUV_half) : RENAME(bgr24ToUV);
        break;
    case PIX_FMT_BGR565:
        c->hcscale_internal = half ? bgr16ToUV_half : bgr16ToUV;
        break;
    case PIX_FMT_BGR555:
        c->hcscale_internal = half ? bgr15ToUV_half : bgr15ToUV;
        break;
    case PIX_FMT_BGR32:
    case PIX_FMT_BGR32_1:
        c->hcscale_internal = half ? rgb32ToUV_half : rgb32ToUV;
        break;
    case PIX_FMT_RGB24:
        c->hcscale_internal = half ? RENAME(rgb24ToUV_half) : RENAME(rgb24ToUV);
        break;
    case PIX_FMT_RGB565:
        c->hcscale_internal = half ? rgb16ToUV_half : rgb16ToUV;
        break;
    case PIX_FMT_RGB555:
        c->hcscale_internal = half ? rgb15ToUV_half : rgb15ToUV;
        break;
    default:
        break;
    }

    /* luma input converter */
    c->hyscale_internal = NULL;
    switch (fmt) {
    case PIX_FMT_YUYV422:
    case PIX_FMT_YUV420PBE:
    case PIX_FMT_YUV422PBE:
    case PIX_FMT_YUV444PBE:
    case PIX_FMT_GRAY16BE:
        c->hyscale_internal = RENAME(yuy2ToY);
        break;
    case PIX_FMT_UYVY422:
    case PIX_FMT_YUV420PLE:
    case PIX_FMT_YUV422PLE:
    case PIX_FMT_YUV444PLE:
    case PIX_FMT_GRAY16LE:
        c->hyscale_internal = RENAME(uyvyToY);
        break;
    case PIX_FMT_BGR24:
        c->hyscale_internal = RENAME(bgr24ToY);
        break;
    case PIX_FMT_BGR565:
        c->hyscale_internal = bgr16ToY;
        break;
    case PIX_FMT_BGR555:
        c->hyscale_internal = bgr15ToY;
        break;
    case PIX_FMT_RGB24:
        c->hyscale_internal = RENAME(rgb24ToY);
        break;
    case PIX_FMT_RGB565:
        c->hyscale_internal = rgb16ToY;
        break;
    case PIX_FMT_RGB555:
        c->hyscale_internal = rgb15ToY;
        break;
    case PIX_FMT_RGB8:
    case PIX_FMT_BGR8:
    case PIX_FMT_PAL8:
    case PIX_FMT_BGR4_BYTE:
    case PIX_FMT_RGB4_BYTE:
        c->hyscale_internal = palToY;
        break;
    case PIX_FMT_MONOBLACK:
        c->hyscale_internal = monoblack2Y;
        break;
    case PIX_FMT_MONOWHITE:
        c->hyscale_internal = monowhite2Y;
        break;
    case PIX_FMT_RGB32:
    case PIX_FMT_RGB32_1:
        c->hyscale_internal = bgr32ToY;
        break;
    case PIX_FMT_BGR32:
    case PIX_FMT_BGR32_1:
        c->hyscale_internal = rgb32ToY;
        break;
    case PIX_FMT_RGB48BE:
    case PIX_FMT_RGB48LE:
        c->hyscale_internal = rgb48ToY;
        break;
    default:
        break;
    }

    /* alpha input converter: only set when an alpha line buffer exists */
    c->hascale_internal = NULL;
    if (c->alpPixBuf) {
        switch (fmt) {
        case PIX_FMT_RGB32:
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32:
        case PIX_FMT_BGR32_1:
            c->hascale_internal = abgrToA;
            break;
        default:
            break;
        }
    }
}