/* ffmpeg / libswscale / swscale_template.c @ revision 9990e426 */

/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

/* This template is included multiple times with different CPU-feature
 * flags (HAVE_MMX2, HAVE_3DNOW, ...); clear any previous definitions
 * before redefining them below. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Prefetch mnemonics: 3DNow and MMX2 each have their own; on plain MMX
 * the macros expand to an asm no-op comment so call sites need no #ifdefs. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined (HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

/* Store fence: only needed (and available) with MMX2's non-temporal movntq. */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

/* Packed byte average: pavgb on MMX2, pavgusb on 3DNow.
 * Note: deliberately left undefined when neither feature is available. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* 64-bit store: non-temporal movntq on MMX2, plain movq otherwise.
 * MOVNTQ is a one-level wrapper so that macro arguments get expanded
 * before REAL_MOVNTQ stringizes them. */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

/* PowerPC/AltiVec implementations live in their own template. */
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

/* Vertical scale to a YV12 plane: walks the NULL-terminated
 * (coefficient, src-pointer) list at `offset`(%0), accumulating pmulhw
 * products into mm3/mm4, then shifts by 3 and packs 8 output bytes per
 * iteration into `dest` via MOVNTQ. %0 is &c->redDither (base for the
 * context-relative offsets such as VROUNDER_OFFSET). */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    asm volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t"\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/* Higher-precision variant of YSCALEYUV2YV12X: processes filter taps in
 * pairs with pmaddwd into 32-bit accumulators (mm4-mm7), rounds and packs
 * afterwards. Same operand/offset conventions as YSCALEYUV2YV12X. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    asm volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov                       4(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq                      8(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov                      16(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                                $16, %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t"\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/* 1-tap ("unscaled") vertical pass: shift 16-bit intermediates down by 7
 * and pack to bytes, truncating (no rounding). Asm fragment only — meant
 * to be embedded in an asm() with %0=src, %1=dst, %2=negative counter. */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

/* Rounding variant of YSCALEYUV2YV121: builds the constant 64 (1<<6) in
 * mm7 via pcmpeqw/psrlw/psllw and adds it before the >>7, so results are
 * rounded to nearest instead of truncated. */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddw              %%mm7, %%mm0      \n\t"\
    "paddw              %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/* Opens the vertical-scale asm for packed output: accumulates chroma
 * (U in mm3, V in mm4) and luma (Y1 in mm1, Y2 in mm7) filter sums.
 * Must be closed by YSCALEYUV2PACKEDX_END (supplies operands/clobbers). */
#define YSCALEYUV2PACKEDX \
    asm volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
    "movq                      %%mm1, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm1         \n\t"\
    "paddw                     %%mm5, %%mm7         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"

/* Closes an asm() opened by YSCALEYUV2PACKEDX[_ACCURATE]: supplies the
 * operand list (%0 = &c->redDither, %1 = dest, %2 = dstW) and clobbers. */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );

/* Higher-precision open of the packed vertical-scale asm: pmaddwd-based
 * accumulation as in YSCALEYUV2YV12X_ACCURATE; chroma sums are parked in
 * U_TEMP/V_TEMP in the context while luma is accumulated, then reloaded
 * into mm3/mm4. Close with YSCALEYUV2PACKEDX_END. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq               8(%%"REG_d"), %%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
    "add                         $16, %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq               8(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
    "add                         $16, %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"

/* YUV->RGB conversion fragment: consumes Y1/Y2 in mm1/mm7 and U/V in
 * mm3/mm4 (as produced by YSCALEYUV2PACKEDX*), applies the context's
 * offset/coefficient tables, and leaves packed B/R/G bytes in
 * mm2/mm5/mm4 (B1|B2, R1|R2, G1|G2) with mm7 zeroed for unpacking. */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\
    "pxor            %%mm7, %%mm7       \n\t"

/* Disabled upstream (#if 0): old full-chroma YUV->RGB interpolation path
 * using MANGLE()d global coefficient tables. Kept for reference. */
#if 0
#define FULL_YSCALEYUV2RGB \
    "pxor                 %%mm7, %%mm7  \n\t"\
    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "xor              %%"REG_a", %%"REG_a"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
\
\
    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
    "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
\
\
    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
    "packuswb             %%mm3, %%mm3  \n\t"\
\
    "packuswb             %%mm0, %%mm0  \n\t"\
    "paddw                %%mm4, %%mm2  \n\t"\
    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
\
    "packuswb             %%mm1, %%mm1  \n\t"
#endif

/* 2-tap vertical interpolation producing packed YUV (no RGB conversion):
 * pre-shifts the stored chroma/luma filter coefficients by 3, then blends
 * buf0/buf1 (luma) and uvbuf0/uvbuf1 (chroma) per pixel. YSCALEYUV2PACKED
 * is the usual expansion wrapper so index/c are macro-expanded first. */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)

/* 2-tap vertical interpolation + YUV->RGB: blends buf0/buf1 and
 * uvbuf0/uvbuf1, then applies the context coefficient tables, leaving
 * packed B/R/G in mm2/mm5/mm4 as in YSCALEYUV2RGBX. YSCALEYUV2RGB is
 * the expansion wrapper. */
#define REAL_YSCALEYUV2RGB(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)

/* 1-tap (single source row) variant of YSCALEYUV2PACKED: no blending,
 * just >>7 of buf0/uvbuf0 into mm1/mm7 (luma) and mm3/mm4 (chroma). */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)

562
/* Single-source-line YUV -> RGB: takes one luma and one chroma line
 * (no vertical interpolation), subtracts the U/V/Y offsets from the
 * context "c", applies the colour-matrix coefficients with pmulhw, and
 * packs the results to unsigned bytes.
 * Output: mm2 = 8x B, mm4 = 8x G, mm5 = 8x R, mm7 = 0. */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)

/* Like REAL_YSCALEYUV2PACKED1, but averages two chroma lines (%2 and %3):
 * add the pairs and shift right by 8 (sum of two 15-bit values, so the
 * extra bit is absorbed by the larger shift).  Luma still comes from a
 * single line and is shifted down by 7. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
/* Like REAL_YSCALEYUV2RGB1 but the chroma input is the average of two
 * lines (%2 and %3): pairs are summed and shifted right by 5 instead of
 * the single-line >>4 (the FIXME notes the unsigned shift may overflow).
 * Output registers are the same: mm2=B, mm4=G, mm5=R, mm7=0. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)

/* Interleave the byte-packed B (mm2), G (mm4), R (mm5) planes into 32-bit
 * 0RGB pixels and store 8 pixels per loop iteration with non-temporal
 * writes; advances "index" by 8 and loops back to label 1 until dstw. */
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0,   (dst, index, 4))\
    MOVNTQ(%%mm2,  8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)

/* Pack B/G/R bytes into RGB565: keep 5 bits of B and R (mask bF8) and
 * 6 bits of G (mask bFC), then shift/merge into 16-bit pixels and store
 * 8 pixels per iteration. */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)

/* Pack B/G/R bytes into RGB555: 5 bits per component (mask bF8 on all
 * three; R is pre-shifted right by one so the top bit stays clear), then
 * merge into 16-bit pixels and store 8 pixels per iteration. */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)

/* Legacy 24-bit writer: builds four 0RGB dwords, then squeezes out the
 * padding bytes with shift/mask/or sequences (bm* bit masks) to emit
 * 8 pixels as three packed 8-byte stores.  Kept for reference; the
 * WRITEBGR24MMX/WRITEBGR24MMX2 variants below are the ones selected. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"

/* Plain-MMX 24-bit writer: expands B/G/R bytes to four 0RGB dwords, then
 * uses psllq/punpckhdq to drop the padding byte and ORs adjacent pixels
 * together so 8 pixels go out as three 8-byte non-temporal stores.
 * Advances both the destination pointer (by 24) and "index" (by 8). */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"

/* MMX2 24-bit writer: uses pshufw to replicate/select component bytes and
 * the ff_M24A/ff_M24B/ff_M24C masks to lay them out directly in 24-bit
 * order, emitting 8 pixels as three non-temporal 8-byte stores.
 * Advances both the destination pointer (by 24) and "index" (by 8). */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"

/* Select the 24-bit writer at compile time: the pshufw-based MMX2 variant
 * when available, otherwise the plain-MMX shuffle sequence. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

/* Pack the luma words (mm1/mm7) and chroma (mm3/mm4) to bytes and
 * interleave them into YUY2 (YUYV) order, storing 8 output pixels
 * (16 bytes) per iteration with non-temporal writes. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)

/**
 * Vertical filtering to planar YV12 output.
 * Applies lumFilter over lumFilterSize luma source lines into dest, and
 * chrFilter over chrFilterSize chroma lines into uDest/vDest (chroma is
 * skipped when uDest is NULL; the V plane lives at offset VOF within the
 * chroma buffers).  On x86 this expands the YSCALEYUV2YV12X* asm macros,
 * choosing the *_ACCURATE variant when SWS_ACCURATE_RND is requested;
 * otherwise it falls back to AltiVec or the generic C implementation.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (c->flags & SWS_ACCURATE_RND){
        if (uDest){
            YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }else{
        if (uDest){
            YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif /* HAVE_MMX */
}

/**
 * Vertical filtering to NV12/NV21 (interleaved-chroma) output.
 * No SIMD implementation here: this template simply forwards to the
 * generic C routine yuv2nv12XinC().
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
             chrFilter, chrSrc, chrFilterSize,
             dest, uDest, dstW, chrDstW, dstFormat);
}

/**
 * 1:1 vertical conversion of the 15-bit intermediate buffers to 8-bit
 * planar YV12 (no filtering).  In the C fallback each output sample is
 * (src[i]+64)>>7, clipped to 0..255 via the (val&256) trick (valid for
 * the intermediate range, where values lie within roughly -255..511);
 * chroma planes are skipped when uDest is NULL.  On x86 the three planes
 * are processed by the YSCALEYUV2YV121 asm loops, with the *_ACCURATE
 * variant when SWS_ACCURATE_RND is set.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    long p= uDest ? 3 : 1;  /* luma only, or luma + both chroma planes */
    uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
    uint8_t *dst[3]= {dest, uDest, vDest};
    long counter[3] = {dstW, chrDstW, chrDstW};

    if (c->flags & SWS_ACCURATE_RND){
        while(p--){
            asm volatile(
                YSCALEYUV2YV121_ACCURATE
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                "g" (-counter[p])
                : "%"REG_a
            );
        }
    }else{
        while(p--){
            asm volatile(
                YSCALEYUV2YV121
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                "g" (-counter[p])
                : "%"REG_a
            );
        }
    }

#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int val= (lumSrc[i]+64)>>7;

        /* out-of-range values have bit 8 set (incl. negatives in two's
         * complement) — clip to 0 or 255 */
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=(chrSrc[i       ]+64)>>7;
            int v=(chrSrc[i + VOFW]+64)>>7;

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
#endif
}

/**
 * vertical scale YV12 to RGB
 *
 * Each supported dstFormat gets a dedicated MMX path on x86: the
 * YSCALEYUV2PACKEDX(_ACCURATE) prologue performs the vertical filtering
 * (the *_ACCURATE variant when SWS_ACCURATE_RND is set), YSCALEYUV2RGBX
 * does the colour conversion, and a WRITE* macro packs/stores the pixels
 * (with optional ordered dither for 15/16 bpp when DITHER1XBPP is
 * defined).  Unsupported formats fall back to AltiVec (for the formats it
 * handles) or to the generic C implementation.
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
{
#ifdef HAVE_MMX
    long dummy=0;
    if (c->flags & SWS_ACCURATE_RND){
        switch(c->dstFormat){
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)

            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
            "add %4, %%"REG_c"                        \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)


            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest), "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_RGB555:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITERGB15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_RGB565:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITERGB16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX_ACCURATE
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3    \n\t"
            "psraw $3, %%mm4    \n\t"
            "psraw $3, %%mm1    \n\t"
            "psraw $3, %%mm7    \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
    }
    }else{
        switch(c->dstFormat)
        {
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
            "add                        %4, %%"REG_c"   \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)

            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest),  "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_RGB555:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4  \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
#endif

            WRITERGB15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_RGB565:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4  \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
#endif

            WRITERGB16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3    \n\t"
            "psraw $3, %%mm4    \n\t"
            "psraw $3, %%mm1    \n\t"
            "psraw $3, %%mm7    \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        }
    }
#endif /* HAVE_MMX */
#ifdef HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of altivec_yuv2packedX() */
    if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                                 chrFilter, chrSrc, chrFilterSize,
                                 dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       dest, dstW, dstY);
}
1200

    
1201
/**
1202
 * vertical bilinear scale YV12 to RGB
1203
 */
1204
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1205
                          uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1206
{
1207
    int yalpha1=yalpha^4095;
1208
    int uvalpha1=uvalpha^4095;
1209
    int i;
1210

    
1211
#if 0 //isn't used
1212
    if (flags&SWS_FULL_CHR_H_INT)
1213
    {
1214
        switch(dstFormat)
1215
        {
1216
#ifdef HAVE_MMX
1217
        case PIX_FMT_RGB32:
1218
            asm volatile(
1219

1220

1221
FULL_YSCALEYUV2RGB
1222
            "punpcklbw %%mm1, %%mm3    \n\t" // BGBGBGBG
1223
            "punpcklbw %%mm7, %%mm0    \n\t" // R0R0R0R0
1224

1225
            "movq      %%mm3, %%mm1    \n\t"
1226
            "punpcklwd %%mm0, %%mm3    \n\t" // BGR0BGR0
1227
            "punpckhwd %%mm0, %%mm1    \n\t" // BGR0BGR0
1228

1229
            MOVNTQ(%%mm3,  (%4, %%REGa, 4))
1230
            MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1231

1232
            "add $4, %%"REG_a"  \n\t"
1233
            "cmp %5, %%"REG_a"  \n\t"
1234
            " jb 1b             \n\t"
1235

1236
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1237
            "m" (yalpha1), "m" (uvalpha1)
1238
            : "%"REG_a
1239
            );
1240
            break;
1241
        case PIX_FMT_BGR24:
1242
            asm volatile(
1243

1244
FULL_YSCALEYUV2RGB
1245

1246
                                              // lsb ... msb
1247
            "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
1248
            "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0
1249

1250
            "movq      %%mm3, %%mm1     \n\t"
1251
            "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
1252
            "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0
1253

1254
            "movq      %%mm3, %%mm2     \n\t" // BGR0BGR0
1255
            "psrlq        $8, %%mm3     \n\t" // GR0BGR00
1256
            "pand "MANGLE(bm00000111)", %%mm2   \n\t" // BGR00000
1257
            "pand "MANGLE(bm11111000)", %%mm3   \n\t" // 000BGR00
1258
            "por       %%mm2, %%mm3     \n\t" // BGRBGR00
1259
            "movq      %%mm1, %%mm2     \n\t"
1260
            "psllq       $48, %%mm1     \n\t" // 000000BG
1261
            "por       %%mm1, %%mm3     \n\t" // BGRBGRBG
1262

1263
            "movq      %%mm2, %%mm1     \n\t" // BGR0BGR0
1264
            "psrld       $16, %%mm2     \n\t" // R000R000
1265
            "psrlq       $24, %%mm1     \n\t" // 0BGR0000
1266
            "por       %%mm2, %%mm1     \n\t" // RBGRR000
1267

1268
            "mov          %4, %%"REG_b" \n\t"
1269
            "add   %%"REG_a", %%"REG_b" \n\t"
1270

1271
#ifdef HAVE_MMX2
1272
            //FIXME Alignment
1273
            "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2)   \n\t"
1274
            "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)   \n\t"
1275
#else
1276
            "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)     \n\t"
1277
            "psrlq  $32, %%mm3                          \n\t"
1278
            "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)     \n\t"
1279
            "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)     \n\t"
1280
#endif
1281
            "add     $4, %%"REG_a"                      \n\t"
1282
            "cmp     %5, %%"REG_a"                      \n\t"
1283
            " jb     1b                                 \n\t"
1284

    
1285
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1286
            "m" (yalpha1), "m" (uvalpha1)
1287
            : "%"REG_a, "%"REG_b
1288
            );
1289
            break;
1290
        case PIX_FMT_BGR555:
1291
            asm volatile(
1292

    
1293
FULL_YSCALEYUV2RGB
1294
#ifdef DITHER1XBPP
1295
            "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1296
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1297
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1298
#endif
1299
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1300
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1301
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1302

    
1303
            "psrlw                   $3, %%mm3  \n\t"
1304
            "psllw                   $2, %%mm1  \n\t"
1305
            "psllw                   $7, %%mm0  \n\t"
1306
            "pand     "MANGLE(g15Mask)", %%mm1  \n\t"
1307
            "pand     "MANGLE(r15Mask)", %%mm0  \n\t"
1308

    
1309
            "por                  %%mm3, %%mm1  \n\t"
1310
            "por                  %%mm1, %%mm0  \n\t"
1311

    
1312
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1313

    
1314
            "add $4, %%"REG_a"  \n\t"
1315
            "cmp %5, %%"REG_a"  \n\t"
1316
            " jb 1b             \n\t"
1317

    
1318
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1319
            "m" (yalpha1), "m" (uvalpha1)
1320
            : "%"REG_a
1321
            );
1322
            break;
1323
        case PIX_FMT_BGR565:
1324
            asm volatile(
1325

    
1326
FULL_YSCALEYUV2RGB
1327
#ifdef DITHER1XBPP
1328
            "paddusb "MANGLE(g6Dither)", %%mm1  \n\t"
1329
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1330
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1331
#endif
1332
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1333
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1334
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1335

    
1336
            "psrlw                   $3, %%mm3  \n\t"
1337
            "psllw                   $3, %%mm1  \n\t"
1338
            "psllw                   $8, %%mm0  \n\t"
1339
            "pand     "MANGLE(g16Mask)", %%mm1  \n\t"
1340
            "pand     "MANGLE(r16Mask)", %%mm0  \n\t"
1341

    
1342
            "por                  %%mm3, %%mm1  \n\t"
1343
            "por                  %%mm1, %%mm0  \n\t"
1344

    
1345
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1346

    
1347
            "add $4, %%"REG_a"  \n\t"
1348
            "cmp %5, %%"REG_a"  \n\t"
1349
            " jb 1b             \n\t"
1350

    
1351
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1352
            "m" (yalpha1), "m" (uvalpha1)
1353
            : "%"REG_a
1354
            );
1355
            break;
1356
#endif /* HAVE_MMX */
1357
        case PIX_FMT_BGR32:
1358
#ifndef HAVE_MMX
1359
        case PIX_FMT_RGB32:
1360
#endif
1361
            if (dstFormat==PIX_FMT_RGB32)
1362
            {
1363
                int i;
1364
#ifdef WORDS_BIGENDIAN
1365
                dest++;
1366
#endif
1367
                for (i=0;i<dstW;i++){
1368
                    // vertical linear interpolation && yuv2rgb in a single step:
1369
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1370
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1371
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1372
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1373
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1374
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1375
                    dest+= 4;
1376
                }
1377
            }
1378
            else if (dstFormat==PIX_FMT_BGR24)
1379
            {
1380
                int i;
1381
                for (i=0;i<dstW;i++){
1382
                    // vertical linear interpolation && yuv2rgb in a single step:
1383
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1384
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1385
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1386
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1387
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1388
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1389
                    dest+= 3;
1390
                }
1391
            }
1392
            else if (dstFormat==PIX_FMT_BGR565)
1393
            {
1394
                int i;
1395
                for (i=0;i<dstW;i++){
1396
                    // vertical linear interpolation && yuv2rgb in a single step:
1397
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1398
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1399
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1400

    
1401
                    ((uint16_t*)dest)[i] =
1402
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1403
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1404
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1405
                }
1406
            }
1407
            else if (dstFormat==PIX_FMT_BGR555)
1408
            {
1409
                int i;
1410
                for (i=0;i<dstW;i++){
1411
                    // vertical linear interpolation && yuv2rgb in a single step:
1412
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1413
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1414
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1415

    
1416
                    ((uint16_t*)dest)[i] =
1417
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1418
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1419
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1420
                }
1421
            }
1422
        }//FULL_UV_IPOL
1423
    else
1424
    {
1425
#endif // if 0
1426
#ifdef HAVE_MMX
1427
        switch(c->dstFormat)
1428
        {
1429
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1430
            case PIX_FMT_RGB32:
1431
                asm volatile(
1432
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1433
                "mov        %4, %%"REG_b"               \n\t"
1434
                "push %%"REG_BP"                        \n\t"
1435
                YSCALEYUV2RGB(%%REGBP, %5)
1436
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1437
                "pop %%"REG_BP"                         \n\t"
1438
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1439

    
1440
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1441
                "a" (&c->redDither)
1442
                );
1443
                return;
1444
            case PIX_FMT_BGR24:
1445
                asm volatile(
1446
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1447
                "mov        %4, %%"REG_b"               \n\t"
1448
                "push %%"REG_BP"                        \n\t"
1449
                YSCALEYUV2RGB(%%REGBP, %5)
1450
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1451
                "pop %%"REG_BP"                         \n\t"
1452
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1453
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1454
                "a" (&c->redDither)
1455
                );
1456
                return;
1457
            case PIX_FMT_RGB555:
1458
                asm volatile(
1459
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1460
                "mov        %4, %%"REG_b"               \n\t"
1461
                "push %%"REG_BP"                        \n\t"
1462
                YSCALEYUV2RGB(%%REGBP, %5)
1463
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1464
#ifdef DITHER1XBPP
1465
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1466
                "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1467
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1468
#endif
1469

    
1470
                WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1471
                "pop %%"REG_BP"                         \n\t"
1472
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1473

    
1474
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475
                "a" (&c->redDither)
1476
                );
1477
                return;
1478
            case PIX_FMT_RGB565:
1479
                asm volatile(
1480
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1481
                "mov        %4, %%"REG_b"               \n\t"
1482
                "push %%"REG_BP"                        \n\t"
1483
                YSCALEYUV2RGB(%%REGBP, %5)
1484
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1485
#ifdef DITHER1XBPP
1486
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1487
                "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1488
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1489
#endif
1490

    
1491
                WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1492
                "pop %%"REG_BP"                         \n\t"
1493
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1494
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1495
                "a" (&c->redDither)
1496
                );
1497
                return;
1498
            case PIX_FMT_YUYV422:
1499
                asm volatile(
1500
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1501
                "mov %4, %%"REG_b"                        \n\t"
1502
                "push %%"REG_BP"                        \n\t"
1503
                YSCALEYUV2PACKED(%%REGBP, %5)
1504
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1505
                "pop %%"REG_BP"                         \n\t"
1506
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1507
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1508
                "a" (&c->redDither)
1509
                );
1510
                return;
1511
            default: break;
1512
        }
1513
#endif //HAVE_MMX
1514
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1515
}
1516

    
1517
/**
1518
 * YV12 to RGB without scaling or interpolating
1519
 */
1520
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1521
                          uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1522
{
1523
    const int yalpha1=0;
1524
    int i;
1525

    
1526
    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1527
    const int yalpha= 4096; //FIXME ...
1528

    
1529
    if (flags&SWS_FULL_CHR_H_INT)
1530
    {
1531
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1532
        return;
1533
    }
1534

    
1535
#ifdef HAVE_MMX
1536
    if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1537
    {
1538
        switch(dstFormat)
1539
        {
1540
        case PIX_FMT_RGB32:
1541
            asm volatile(
1542
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1543
            "mov        %4, %%"REG_b"               \n\t"
1544
            "push %%"REG_BP"                        \n\t"
1545
            YSCALEYUV2RGB1(%%REGBP, %5)
1546
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1547
            "pop %%"REG_BP"                         \n\t"
1548
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1549

    
1550
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1551
            "a" (&c->redDither)
1552
            );
1553
            return;
1554
        case PIX_FMT_BGR24:
1555
            asm volatile(
1556
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1557
            "mov        %4, %%"REG_b"               \n\t"
1558
            "push %%"REG_BP"                        \n\t"
1559
            YSCALEYUV2RGB1(%%REGBP, %5)
1560
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1561
            "pop %%"REG_BP"                         \n\t"
1562
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1563

    
1564
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1565
            "a" (&c->redDither)
1566
            );
1567
            return;
1568
        case PIX_FMT_RGB555:
1569
            asm volatile(
1570
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1571
            "mov        %4, %%"REG_b"               \n\t"
1572
            "push %%"REG_BP"                        \n\t"
1573
            YSCALEYUV2RGB1(%%REGBP, %5)
1574
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1575
#ifdef DITHER1XBPP
1576
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1577
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1578
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1579
#endif
1580
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1581
            "pop %%"REG_BP"                         \n\t"
1582
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1583

    
1584
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1585
            "a" (&c->redDither)
1586
            );
1587
            return;
1588
        case PIX_FMT_RGB565:
1589
            asm volatile(
1590
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1591
            "mov        %4, %%"REG_b"               \n\t"
1592
            "push %%"REG_BP"                        \n\t"
1593
            YSCALEYUV2RGB1(%%REGBP, %5)
1594
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1595
#ifdef DITHER1XBPP
1596
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1597
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1598
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1599
#endif
1600

    
1601
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1602
            "pop %%"REG_BP"                         \n\t"
1603
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1604

    
1605
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1606
            "a" (&c->redDither)
1607
            );
1608
            return;
1609
        case PIX_FMT_YUYV422:
1610
            asm volatile(
1611
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1612
            "mov        %4, %%"REG_b"               \n\t"
1613
            "push %%"REG_BP"                        \n\t"
1614
            YSCALEYUV2PACKED1(%%REGBP, %5)
1615
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1616
            "pop %%"REG_BP"                         \n\t"
1617
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1618

    
1619
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1620
            "a" (&c->redDither)
1621
            );
1622
            return;
1623
        }
1624
    }
1625
    else
1626
    {
1627
        switch(dstFormat)
1628
        {
1629
        case PIX_FMT_RGB32:
1630
            asm volatile(
1631
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1632
            "mov        %4, %%"REG_b"               \n\t"
1633
            "push %%"REG_BP"                        \n\t"
1634
            YSCALEYUV2RGB1b(%%REGBP, %5)
1635
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1636
            "pop %%"REG_BP"                         \n\t"
1637
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1638

    
1639
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1640
            "a" (&c->redDither)
1641
            );
1642
            return;
1643
        case PIX_FMT_BGR24:
1644
            asm volatile(
1645
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1646
            "mov        %4, %%"REG_b"               \n\t"
1647
            "push %%"REG_BP"                        \n\t"
1648
            YSCALEYUV2RGB1b(%%REGBP, %5)
1649
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1650
            "pop %%"REG_BP"                         \n\t"
1651
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1652

    
1653
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1654
            "a" (&c->redDither)
1655
            );
1656
            return;
1657
        case PIX_FMT_RGB555:
1658
            asm volatile(
1659
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1660
            "mov        %4, %%"REG_b"               \n\t"
1661
            "push %%"REG_BP"                        \n\t"
1662
            YSCALEYUV2RGB1b(%%REGBP, %5)
1663
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1664
#ifdef DITHER1XBPP
1665
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1666
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1667
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1668
#endif
1669
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
1670
            "pop %%"REG_BP"                         \n\t"
1671
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1672

    
1673
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1674
            "a" (&c->redDither)
1675
            );
1676
            return;
1677
        case PIX_FMT_RGB565:
1678
            asm volatile(
1679
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1680
            "mov        %4, %%"REG_b"               \n\t"
1681
            "push %%"REG_BP"                        \n\t"
1682
            YSCALEYUV2RGB1b(%%REGBP, %5)
1683
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1684
#ifdef DITHER1XBPP
1685
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1686
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1687
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1688
#endif
1689

    
1690
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
1691
            "pop %%"REG_BP"                         \n\t"
1692
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1693

    
1694
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1695
            "a" (&c->redDither)
1696
            );
1697
            return;
1698
        case PIX_FMT_YUYV422:
1699
            asm volatile(
1700
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1701
            "mov        %4, %%"REG_b"               \n\t"
1702
            "push %%"REG_BP"                        \n\t"
1703
            YSCALEYUV2PACKED1b(%%REGBP, %5)
1704
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1705
            "pop %%"REG_BP"                         \n\t"
1706
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1707

    
1708
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1709
            "a" (&c->redDither)
1710
            );
1711
            return;
1712
        }
1713
    }
1714
#endif /* HAVE_MMX */
1715
    if (uvalpha < 2048)
1716
    {
1717
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1718
    }else{
1719
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1720
    }
1721
}
1722

    
1723
//FIXME yuy2* can read up to 7 samples too much
1724

    
1725
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1726
{
1727
#ifdef HAVE_MMX
1728
    asm volatile(
1729
    "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1730
    "mov                    %0, %%"REG_a"       \n\t"
1731
    "1:                                         \n\t"
1732
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1733
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1734
    "pand                %%mm2, %%mm0           \n\t"
1735
    "pand                %%mm2, %%mm1           \n\t"
1736
    "packuswb            %%mm1, %%mm0           \n\t"
1737
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
1738
    "add                    $8, %%"REG_a"       \n\t"
1739
    " js                    1b                  \n\t"
1740
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1741
    : "%"REG_a
1742
    );
1743
#else
1744
    int i;
1745
    for (i=0; i<width; i++)
1746
        dst[i]= src[2*i];
1747
#endif
1748
}
1749

    
1750
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1751
{
1752
#ifdef HAVE_MMX
1753
    asm volatile(
1754
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1755
    "mov                    %0, %%"REG_a"       \n\t"
1756
    "1:                                         \n\t"
1757
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1758
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1759
    "psrlw                  $8, %%mm0           \n\t"
1760
    "psrlw                  $8, %%mm1           \n\t"
1761
    "packuswb            %%mm1, %%mm0           \n\t"
1762
    "movq                %%mm0, %%mm1           \n\t"
1763
    "psrlw                  $8, %%mm0           \n\t"
1764
    "pand                %%mm4, %%mm1           \n\t"
1765
    "packuswb            %%mm0, %%mm0           \n\t"
1766
    "packuswb            %%mm1, %%mm1           \n\t"
1767
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
1768
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
1769
    "add                    $4, %%"REG_a"       \n\t"
1770
    " js                    1b                  \n\t"
1771
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1772
    : "%"REG_a
1773
    );
1774
#else
1775
    int i;
1776
    for (i=0; i<width; i++)
1777
    {
1778
        dstU[i]= src1[4*i + 1];
1779
        dstV[i]= src1[4*i + 3];
1780
    }
1781
#endif
1782
    assert(src1 == src2);
1783
}
1784

    
1785
/* This is almost identical to the previous, end exists only because
1786
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1787
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1788
{
1789
#ifdef HAVE_MMX
1790
    asm volatile(
1791
    "mov                  %0, %%"REG_a"         \n\t"
1792
    "1:                                         \n\t"
1793
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1794
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1795
    "psrlw                $8, %%mm0             \n\t"
1796
    "psrlw                $8, %%mm1             \n\t"
1797
    "packuswb          %%mm1, %%mm0             \n\t"
1798
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1799
    "add                  $8, %%"REG_a"         \n\t"
1800
    " js                  1b                    \n\t"
1801
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1802
    : "%"REG_a
1803
    );
1804
#else
1805
    int i;
1806
    for (i=0; i<width; i++)
1807
        dst[i]= src[2*i+1];
1808
#endif
1809
}
1810

    
1811
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1812
{
1813
#ifdef HAVE_MMX
1814
    asm volatile(
1815
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1816
    "mov                    %0, %%"REG_a"       \n\t"
1817
    "1:                                         \n\t"
1818
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1819
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1820
    "pand                %%mm4, %%mm0           \n\t"
1821
    "pand                %%mm4, %%mm1           \n\t"
1822
    "packuswb            %%mm1, %%mm0           \n\t"
1823
    "movq                %%mm0, %%mm1           \n\t"
1824
    "psrlw                  $8, %%mm0           \n\t"
1825
    "pand                %%mm4, %%mm1           \n\t"
1826
    "packuswb            %%mm0, %%mm0           \n\t"
1827
    "packuswb            %%mm1, %%mm1           \n\t"
1828
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
1829
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
1830
    "add                    $4, %%"REG_a"       \n\t"
1831
    " js                    1b                  \n\t"
1832
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1833
    : "%"REG_a
1834
    );
1835
#else
1836
    int i;
1837
    for (i=0; i<width; i++)
1838
    {
1839
        dstU[i]= src1[4*i + 0];
1840
        dstV[i]= src1[4*i + 2];
1841
    }
1842
#endif
1843
    assert(src1 == src2);
1844
}
1845

    
1846
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1847
{
1848
    int i;
1849
    for (i=0; i<width; i++)
1850
    {
1851
        int b=  ((uint32_t*)src)[i]&0xFF;
1852
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
1853
        int r= (((uint32_t*)src)[i]>>16)&0xFF;
1854

    
1855
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1856
    }
1857
}
1858

    
1859
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1860
{
1861
    int i;
1862
    assert(src1 == src2);
1863
    for (i=0; i<width; i++)
1864
    {
1865
        const int a= ((uint32_t*)src1)[2*i+0];
1866
        const int e= ((uint32_t*)src1)[2*i+1];
1867
        const int l= (a&0xFF00FF) + (e&0xFF00FF);
1868
        const int h= (a&0x00FF00) + (e&0x00FF00);
1869
        const int b=  l&0x3FF;
1870
        const int g=  h>>8;
1871
        const int r=  l>>16;
1872

    
1873
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1874
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1875
    }
1876
}
1877

    
1878
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1879
{
1880
#ifdef HAVE_MMX
1881
    asm volatile(
1882
    "mov                        %2, %%"REG_a"   \n\t"
1883
    "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
1884
    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
1885
    "pxor                    %%mm7, %%mm7       \n\t"
1886
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
1887
    ASMALIGN(4)
1888
    "1:                                         \n\t"
1889
    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
1890
    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
1891
    "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
1892
    "punpcklbw               %%mm7, %%mm0       \n\t"
1893
    "punpcklbw               %%mm7, %%mm1       \n\t"
1894
    "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
1895
    "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
1896
    "punpcklbw               %%mm7, %%mm2       \n\t"
1897
    "punpcklbw               %%mm7, %%mm3       \n\t"
1898
    "pmaddwd                 %%mm6, %%mm0       \n\t"
1899
    "pmaddwd                 %%mm6, %%mm1       \n\t"
1900
    "pmaddwd                 %%mm6, %%mm2       \n\t"
1901
    "pmaddwd                 %%mm6, %%mm3       \n\t"
1902
#ifndef FAST_BGR2YV12
1903
    "psrad                      $8, %%mm0       \n\t"
1904
    "psrad                      $8, %%mm1       \n\t"
1905
    "psrad                      $8, %%mm2       \n\t"
1906
    "psrad                      $8, %%mm3       \n\t"
1907
#endif
1908
    "packssdw                %%mm1, %%mm0       \n\t"
1909
    "packssdw                %%mm3, %%mm2       \n\t"
1910
    "pmaddwd                 %%mm5, %%mm0       \n\t"
1911
    "pmaddwd                 %%mm5, %%mm2       \n\t"
1912
    "packssdw                %%mm2, %%mm0       \n\t"
1913
    "psraw                      $7, %%mm0       \n\t"
1914

    
1915
    "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
1916
    "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
1917
    "punpcklbw               %%mm7, %%mm4       \n\t"
1918
    "punpcklbw               %%mm7, %%mm1       \n\t"
1919
    "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
1920
    "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
1921
    "punpcklbw               %%mm7, %%mm2       \n\t"
1922
    "punpcklbw               %%mm7, %%mm3       \n\t"
1923
    "pmaddwd                 %%mm6, %%mm4       \n\t"
1924
    "pmaddwd                 %%mm6, %%mm1       \n\t"
1925
    "pmaddwd                 %%mm6, %%mm2       \n\t"
1926
    "pmaddwd                 %%mm6, %%mm3       \n\t"
1927
#ifndef FAST_BGR2YV12
1928
    "psrad                      $8, %%mm4       \n\t"
1929
    "psrad                      $8, %%mm1       \n\t"
1930
    "psrad                      $8, %%mm2       \n\t"
1931
    "psrad                      $8, %%mm3       \n\t"
1932
#endif
1933
    "packssdw                %%mm1, %%mm4       \n\t"
1934
    "packssdw                %%mm3, %%mm2       \n\t"
1935
    "pmaddwd                 %%mm5, %%mm4       \n\t"
1936
    "pmaddwd                 %%mm5, %%mm2       \n\t"
1937
    "add                       $24, %%"REG_d"   \n\t"
1938
    "packssdw                %%mm2, %%mm4       \n\t"
1939
    "psraw                      $7, %%mm4       \n\t"
1940

    
1941
    "packuswb                %%mm4, %%mm0       \n\t"
1942
    "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
1943

    
1944
    "movq                    %%mm0, (%1, %%"REG_a") \n\t"
1945
    "add                        $8, %%"REG_a"   \n\t"
1946
    " js                        1b              \n\t"
1947
    : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1948
    : "%"REG_a, "%"REG_d
1949
    );
1950
#else
1951
    int i;
1952
    for (i=0; i<width; i++)
1953
    {
1954
        int b= src[i*3+0];
1955
        int g= src[i*3+1];
1956
        int r= src[i*3+2];
1957

    
1958
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1959
    }
1960
#endif /* HAVE_MMX */
1961
}
1962

    
1963
/* Convert one line of packed BGR24 to horizontally 2:1 subsampled chroma.
 * Two adjacent source pixels are averaged per output U/V sample.
 * src1 and src2 must point to the same line (asserted at the end). */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    /* Registers: REG_a counts output samples from -width up to 0,
     * REG_d indexes the source bytes (6 bytes = 2 BGR pixels per sample).
     * mm5 = ff_w1111 (horizontal-add helper), mm6 = U coefficients,
     * mm7 = 0 (for byte->word unpacking). */
    asm volatile(
    "mov                        %3, %%"REG_a"   \n\t"
    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
    "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
    "pxor                    %%mm7, %%mm7       \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t" // REG_d = 3 * (-width)
    "add                 %%"REG_d", %%"REG_d"   \n\t" // REG_d = 6 * (-width)
    ASMALIGN(4)
    "1:                                         \n\t"
    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    /* fast path: average pixel pairs with pavgb (psrlq $24 aligns the
       second pixel of each pair over the first) */
    "movq          (%0, %%"REG_d"), %%mm0       \n\t"
    "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
    "movq                    %%mm0, %%mm1       \n\t"
    "movq                    %%mm2, %%mm3       \n\t"
    "psrlq                     $24, %%mm0       \n\t"
    "psrlq                     $24, %%mm2       \n\t"
    PAVGB(%%mm1, %%mm0)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
#else
    /* plain MMX: widen to words, add the two pixels, then halve */
    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
    "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "paddw                   %%mm2, %%mm0       \n\t"
    "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
    "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm4       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "paddw                   %%mm4, %%mm2       \n\t"
    "psrlw                      $1, %%mm0       \n\t"
    "psrlw                      $1, %%mm2       \n\t"
#endif
    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"

    /* mm1/mm3 accumulate V, mm0/mm2 accumulate U (mm6 = U coeffs) */
    "pmaddwd                 %%mm0, %%mm1       \n\t"
    "pmaddwd                 %%mm2, %%mm3       \n\t"
    "pmaddwd                 %%mm6, %%mm0       \n\t"
    "pmaddwd                 %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                      $8, %%mm0       \n\t"
    "psrad                      $8, %%mm1       \n\t"
    "psrad                      $8, %%mm2       \n\t"
    "psrad                      $8, %%mm3       \n\t"
#endif
    "packssdw                %%mm2, %%mm0       \n\t"
    "packssdw                %%mm3, %%mm1       \n\t"
    "pmaddwd                 %%mm5, %%mm0       \n\t"
    "pmaddwd                 %%mm5, %%mm1       \n\t"
    "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
    "psraw                      $7, %%mm0       \n\t"

    /* same computation for the next two output samples (pixels 4..7) */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    "movq       12(%0, %%"REG_d"), %%mm4       \n\t"
    "movq       18(%0, %%"REG_d"), %%mm2       \n\t"
    "movq                   %%mm4, %%mm1       \n\t"
    "movq                   %%mm2, %%mm3       \n\t"
    "psrlq                    $24, %%mm4       \n\t"
    "psrlq                    $24, %%mm2       \n\t"
    PAVGB(%%mm1, %%mm4)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw              %%mm7, %%mm4       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
#else
    "movd       12(%0, %%"REG_d"), %%mm4       \n\t"
    "movd       15(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw              %%mm7, %%mm4       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
    "paddw                  %%mm2, %%mm4       \n\t"
    "movd       18(%0, %%"REG_d"), %%mm5       \n\t"
    "movd       21(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw              %%mm7, %%mm5       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
    "paddw                  %%mm5, %%mm2       \n\t"
    "movq      "MANGLE(ff_w1111)", %%mm5       \n\t" // mm5 was clobbered above; restore
    "psrlw                     $2, %%mm4       \n\t"
    "psrlw                     $2, %%mm2       \n\t"
#endif
    "movq "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
    "movq "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"

    "pmaddwd                %%mm4, %%mm1       \n\t"
    "pmaddwd                %%mm2, %%mm3       \n\t"
    "pmaddwd                %%mm6, %%mm4       \n\t"
    "pmaddwd                %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                     $8, %%mm4       \n\t"
    "psrad                     $8, %%mm1       \n\t"
    "psrad                     $8, %%mm2       \n\t"
    "psrad                     $8, %%mm3       \n\t"
#endif
    "packssdw               %%mm2, %%mm4       \n\t"
    "packssdw               %%mm3, %%mm1       \n\t"
    "pmaddwd                %%mm5, %%mm4       \n\t"
    "pmaddwd                %%mm5, %%mm1       \n\t"
    "add                      $24, %%"REG_d"   \n\t" // advance source by 8 pixels
    "packssdw               %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
    "psraw                     $7, %%mm4       \n\t"

    /* interleave the two halves into U3..U0 / V3..V0, bias, and store */
    "movq                   %%mm0, %%mm1       \n\t"
    "punpckldq              %%mm4, %%mm0       \n\t"
    "punpckhdq              %%mm4, %%mm1       \n\t"
    "packsswb               %%mm1, %%mm0       \n\t"
    "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0    \n\t"

    "movd                   %%mm0, (%1, %%"REG_a")  \n\t"
    "punpckhdq              %%mm0, %%mm0            \n\t"
    "movd                   %%mm0, (%2, %%"REG_a")  \n\t"
    "add                       $4, %%"REG_a"        \n\t"
    " js                       1b                   \n\t"
    : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
    : "%"REG_a, "%"REG_d
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        /* sum of the two pixels of the pair, per component */
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        /* +1 in the shift folds in the /2 averaging */
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
    }
#endif /* HAVE_MMX */
    assert(src1 == src2);
}
2096

    
2097
static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2098
{
2099
    int i;
2100
    for (i=0; i<width; i++)
2101
    {
2102
        int d= ((uint16_t*)src)[i];
2103
        int b= d&0x1F;
2104
        int g= (d>>5)&0x3F;
2105
        int r= (d>>11)&0x1F;
2106

    
2107
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2108
    }
2109
}
2110

    
2111
/* Convert pairs of 5-6-5 16-bit pixels to one averaged U and one averaged V
 * sample each (horizontal 2:1 chroma subsampling).  Two adjacent 16-bit
 * pixels are fetched with a single 32-bit load and summed per component
 * using a SWAR packing trick. */
static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];   // two adjacent 565 pixels in one word

        /* NOTE(review): SWAR trick — dl isolates the fields of both pixels,
           dh realigns the second pixel so the per-component sums land in
           disjoint bit ranges of d below; verify masks against the 565 layout. */
        int dl= (d0&0x07E0F81F);
        int dh= ((d0>>5)&0x07C0F83F);

        int dh2= (dh>>11) + (dh<<21);
        int d= dh2 + dl;                // per-component sums of the pixel pair

        int b= d&0x7F;                  // sum of the two 5-bit blue fields
        int r= (d>>11)&0x7F;            // sum of the two 5-bit red fields
        int g= d>>21;                   // sum of the two 6-bit green fields
        /* +1-2 in the shift: /2 averaging combined with the 565->8bit scale */
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
    }
}
2132

    
2133
static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2134
{
2135
    int i;
2136
    for (i=0; i<width; i++)
2137
    {
2138
        int d= ((uint16_t*)src)[i];
2139
        int b= d&0x1F;
2140
        int g= (d>>5)&0x1F;
2141
        int r= (d>>10)&0x1F;
2142

    
2143
        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2144
    }
2145
}
2146

    
2147
/* Convert pairs of 5-5-5 15-bit pixels to one averaged U and one averaged V
 * sample each (horizontal 2:1 chroma subsampling), using the same SWAR
 * pairing trick as rgb16ToUV but with 5-bit green masks. */
static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];   // two adjacent 555 pixels in one word

        /* NOTE(review): SWAR trick — isolate fields of both pixels, realign
           the second one, then add so per-component sums do not collide. */
        int dl= (d0&0x03E07C1F);
        int dh= ((d0>>5)&0x03E0F81F);

        int dh2= (dh>>11) + (dh<<21);
        int d= dh2 + dl;                // per-component sums of the pixel pair

        int b= d&0x7F;                  // sum of the two blue fields
        int r= (d>>10)&0x7F;            // sum of the two red fields
        int g= d>>21;                   // sum of the two green fields
        /* +1-3 in the shift: /2 averaging combined with the 555->8bit scale */
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
    }
}
2168

    
2169

    
2170
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2171
{
2172
    int i;
2173
    for (i=0; i<width; i++)
2174
    {
2175
        int r=  ((uint32_t*)src)[i]&0xFF;
2176
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
2177
        int b= (((uint32_t*)src)[i]>>16)&0xFF;
2178

    
2179
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2180
    }
2181
}
2182

    
2183
/* Convert pairs of 32-bit pixels to one averaged U and one averaged V sample
 * each (horizontal 2:1 chroma subsampling).  R+B of both pixels are summed
 * in one word and G in another so that two component sums are computed per
 * addition. */
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        const int a= ((uint32_t*)src1)[2*i+0];     // first pixel of the pair
        const int e= ((uint32_t*)src1)[2*i+1];     // second pixel of the pair
        const int l= (a&0xFF00FF) + (e&0xFF00FF);  // summed R (low) and B (high)
        const int h= (a&0x00FF00) + (e&0x00FF00);  // summed G
        const int r=  l&0x3FF;                     // 9-bit sum of the reds
        const int g=  h>>8;                        // 9-bit sum of the greens
        const int b=  l>>16;                       // 9-bit sum of the blues

        /* +1 in the shift folds in the /2 averaging */
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
    }
}
2201

    
2202
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2203
{
2204
    int i;
2205
    for (i=0; i<width; i++)
2206
    {
2207
        int r= src[i*3+0];
2208
        int g= src[i*3+1];
2209
        int b= src[i*3+2];
2210

    
2211
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2212
    }
2213
}
2214

    
2215
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2216
{
2217
    int i;
2218
    assert(src1==src2);
2219
    for (i=0; i<width; i++)
2220
    {
2221
        int r= src1[6*i + 0] + src1[6*i + 3];
2222
        int g= src1[6*i + 1] + src1[6*i + 4];
2223
        int b= src1[6*i + 2] + src1[6*i + 5];
2224

    
2225
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2226
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2227
    }
2228
}
2229

    
2230
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2231
{
2232
    int i;
2233
    for (i=0; i<width; i++)
2234
    {
2235
        int d= ((uint16_t*)src)[i];
2236
        int r= d&0x1F;
2237
        int g= (d>>5)&0x3F;
2238
        int b= (d>>11)&0x1F;
2239

    
2240
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2241
    }
2242
}
2243

    
2244
/* Convert pairs of 5-6-5 16-bit pixels (R low) to one averaged U and one
 * averaged V sample each (horizontal 2:1 chroma subsampling), using a SWAR
 * half-word-swap trick to sum the two pixels per component. */
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];   // two adjacent 565 pixels in one word

        /* swap the 16-bit halves and add, so each masked field of d holds
           the sum of the corresponding fields of both pixels */
        int dl= (d0&0x07E0F81F);
        int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);

        int r= d&0x3F;                  // sum of the two red fields
        int b= (d>>11)&0x3F;            // sum of the two blue fields
        int g= d>>21;                   // sum of the two green fields
        /* +1-2 in the shift: /2 averaging combined with the 565->8bit scale */
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
    }
}
2262

    
2263
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2264
{
2265
    int i;
2266
    for (i=0; i<width; i++)
2267
    {
2268
        int d= ((uint16_t*)src)[i];
2269
        int r= d&0x1F;
2270
        int g= (d>>5)&0x1F;
2271
        int b= (d>>10)&0x1F;
2272

    
2273
        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2274
    }
2275
}
2276

    
2277
/* Convert pairs of 5-5-5 15-bit pixels (R low) to one averaged U and one
 * averaged V sample each (horizontal 2:1 chroma subsampling), using the
 * same SWAR half-word-swap trick as bgr16ToUV with 5-bit masks. */
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];   // two adjacent 555 pixels in one word

        /* swap the 16-bit halves and add, so each masked field of d holds
           the sum of the corresponding fields of both pixels */
        int dl= (d0&0x03E07C1F);
        int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);

        int r= d&0x3F;                  // sum of the two red fields
        int b= (d>>10)&0x3F;            // sum of the two blue fields
        int g= d>>21;                   // sum of the two green fields
        /* +1-3 in the shift: /2 averaging combined with the 555->8bit scale */
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
    }
}
2295

    
2296
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2297
{
2298
    int i;
2299
    for (i=0; i<width; i++)
2300
    {
2301
        int d= src[i];
2302

    
2303
        dst[i]= pal[d] & 0xFF;
2304
    }
2305
}
2306

    
2307
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2308
{
2309
    int i;
2310
    assert(src1 == src2);
2311
    for (i=0; i<width; i++)
2312
    {
2313
        int p= pal[src1[i]];
2314

    
2315
        dstU[i]= p>>8;
2316
        dstV[i]= p>>16;
2317
    }
2318
}
2319

    
2320
// bilinear / bicubic scaling
2321
/* Horizontally scale one 8-bit line into 16-bit output using an FIR filter:
 * dst[i] = clip(sum_j src[filterPos[i]+j] * filter[i*filterSize+j] >> 7).
 * MMX paths exist for filterSize 4 and 8 plus a generic inner-loop variant;
 * the scalar fallback (and AltiVec) handle everything else. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#ifdef HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        /* bias the pointers so a single counter running from -2*dstW to 0
           indexes filter, filterPos and dst simultaneously */
        long counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push            %%"REG_b"              \n\t" // ebx is reserved under PIC
#endif
        "pxor                %%mm7, %%mm7       \n\t"
        "movq        "MANGLE(w02)", %%mm6       \n\t"
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        /* two output samples per iteration: load both filterPos entries,
           the 2x4 filter taps, and 4 source bytes for each */
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
        "movd      (%3, %%"REG_a"), %%mm0       \n\t"
        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
        "punpcklbw           %%mm7, %%mm0       \n\t"
        "punpcklbw           %%mm7, %%mm2       \n\t"
        "pmaddwd             %%mm1, %%mm0       \n\t"
        "pmaddwd             %%mm2, %%mm3       \n\t"
        "psrad                  $8, %%mm0       \n\t"
        "psrad                  $8, %%mm3       \n\t"
        "packssdw            %%mm3, %%mm0       \n\t"
        "pmaddwd             %%mm6, %%mm0       \n\t" // horizontal add via w02
        "packssdw            %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
        "add                    $4, %%"REG_BP"  \n\t"
        " jnc                   1b              \n\t"

        "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
        "pop             %%"REG_b"              \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        long counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push             %%"REG_b"             \n\t"
#endif
        "pxor                 %%mm7, %%mm7      \n\t"
        "movq         "MANGLE(w02)", %%mm6      \n\t"
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
        "mov              %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        /* same as the filterSize==4 loop but accumulates two 4-tap halves */
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm0      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm0      \n\t"
        "pmaddwd              %%mm2, %%mm3      \n\t"

        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm4      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm4      \n\t"
        "pmaddwd              %%mm2, %%mm5      \n\t"
        "paddd                %%mm4, %%mm0      \n\t"
        "paddd                %%mm5, %%mm3      \n\t"

        "psrad                   $8, %%mm0      \n\t"
        "psrad                   $8, %%mm3      \n\t"
        "packssdw             %%mm3, %%mm0      \n\t"
        "pmaddwd              %%mm6, %%mm0      \n\t"
        "packssdw             %%mm0, %%mm0      \n\t"
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
        "add                     $4, %%"REG_BP" \n\t"
        " jnc                    1b             \n\t"

        "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
        "pop              %%"REG_b"             \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else
    {
        /* generic filterSize: inner loop "2:" walks the taps 4 at a time
           until REG_c reaches offset (= src+filterSize, used as sentinel) */
        uint8_t *offset = src+filterSize;
        long counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
        "pxor                  %%mm7, %%mm7     \n\t"
        "movq          "MANGLE(w02)", %%mm6     \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "mov                      %2, %%"REG_c" \n\t"
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
        "mov                      %5, %%"REG_c" \n\t"
        "pxor                  %%mm4, %%mm4     \n\t"
        "pxor                  %%mm5, %%mm5     \n\t"
        "2:                                     \n\t"
        "movq                   (%1), %%mm1     \n\t"
        "movq               (%1, %6), %%mm3     \n\t"
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
        "punpcklbw             %%mm7, %%mm0     \n\t"
        "punpcklbw             %%mm7, %%mm2     \n\t"
        "pmaddwd               %%mm1, %%mm0     \n\t"
        "pmaddwd               %%mm2, %%mm3     \n\t"
        "paddd                 %%mm3, %%mm5     \n\t"
        "paddd                 %%mm0, %%mm4     \n\t"
        "add                      $8, %1        \n\t"
        "add                      $4, %%"REG_c" \n\t"
        "cmp                      %4, %%"REG_c" \n\t"
        " jb                      2b            \n\t"
        "add                      %6, %1        \n\t" // skip the second sample's taps
        "psrad                    $8, %%mm4     \n\t"
        "psrad                    $8, %%mm5     \n\t"
        "packssdw              %%mm5, %%mm4     \n\t"
        "pmaddwd               %%mm6, %%mm4     \n\t"
        "packssdw              %%mm4, %%mm4     \n\t"
        "mov                      %3, %%"REG_a" \n\t"
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
        "add                      $4, %0        \n\t"
        " jnc                     1b            \n\t"

        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" (filterSize*2)
        : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#ifdef HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* plain C reference implementation */
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        for (j=0; j<filterSize; j++)
        {
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
    }
#endif /* HAVE_ALTIVEC */
#endif /* HAVE_MMX */
}
2503
      // *** horizontal scale Y line to temp buffer
2504
/* Horizontally scale one luma line to the 16-bit temp buffer.
 * First converts non-8-bit-luma inputs into formatConvBuffer, then either
 * runs the FIR hScale() or, with SWS_FAST_BILINEAR, one of the fast
 * bilinear asm paths (MMX2 "funny code" or plain x86). */
static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint8_t *pal)
{
    /* step 1: convert the input line to 8-bit luma if needed */
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32_1)
    {
        /* same as RGB32 but with the alpha byte first; skip it */
        RENAME(bgr32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32_1)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src+ALT32_CORR, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
        src= formatConvBuffer;
    }

    /* step 2: scale */
#ifdef HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            /* runtime-generated scaler: funnyYCode is called 8 times, each
               call consuming one chunk of mmx2FilterPos/mmx2Filter */
            asm volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t" // save ebx (PIC register)
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t" // restore ebx
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyYCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            /* the generated code may overread at the right edge; patch the
               last samples with the replicated edge pixel */
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif /* HAVE_MMX2 */
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        /* 16.16 fixed point walk over src; two samples per iteration */
        asm volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* portable C fallback: 16.16 fixed-point bilinear, 7-bit result scale */
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif /* defined(ARCH_X86) */
    }
}
2715

    
2716
inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2717
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2718
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2719
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2720
                                   int32_t *mmx2FilterPos, uint8_t *pal)
2721
{
2722
    if (srcFormat==PIX_FMT_YUYV422)
2723
    {
2724
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2725
        src1= formatConvBuffer;
2726
        src2= formatConvBuffer+VOFW;
2727
    }
2728
    else if (srcFormat==PIX_FMT_UYVY422)
2729
    {
2730
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2731
        src1= formatConvBuffer;
2732
        src2= formatConvBuffer+VOFW;
2733
    }
2734
    else if (srcFormat==PIX_FMT_RGB32)
2735
    {
2736
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2737
        src1= formatConvBuffer;
2738
        src2= formatConvBuffer+VOFW;
2739
    }
2740
    else if (srcFormat==PIX_FMT_RGB32_1)
2741
    {
2742
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2743
        src1= formatConvBuffer;
2744
        src2= formatConvBuffer+VOFW;
2745
    }
2746
    else if (srcFormat==PIX_FMT_BGR24)
2747
    {
2748
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2749
        src1= formatConvBuffer;
2750
        src2= formatConvBuffer+VOFW;
2751
    }
2752
    else if (srcFormat==PIX_FMT_BGR565)
2753
    {
2754
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2755
        src1= formatConvBuffer;
2756
        src2= formatConvBuffer+VOFW;
2757
    }
2758
    else if (srcFormat==PIX_FMT_BGR555)
2759
    {
2760
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2761
        src1= formatConvBuffer;
2762
        src2= formatConvBuffer+VOFW;
2763
    }
2764
    else if (srcFormat==PIX_FMT_BGR32)
2765
    {
2766
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2767
        src1= formatConvBuffer;
2768
        src2= formatConvBuffer+VOFW;
2769
    }
2770
    else if (srcFormat==PIX_FMT_BGR32_1)
2771
    {
2772
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1+ALT32_CORR, src2+ALT32_CORR, srcW);
2773
        src1= formatConvBuffer;
2774
        src2= formatConvBuffer+VOFW;
2775
    }
2776
    else if (srcFormat==PIX_FMT_RGB24)
2777
    {
2778
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2779
        src1= formatConvBuffer;
2780
        src2= formatConvBuffer+VOFW;
2781
    }
2782
    else if (srcFormat==PIX_FMT_RGB565)
2783
    {
2784
        RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2785
        src1= formatConvBuffer;
2786
        src2= formatConvBuffer+VOFW;
2787
    }
2788
    else if (srcFormat==PIX_FMT_RGB555)
2789
    {
2790
        RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2791
        src1= formatConvBuffer;
2792
        src2= formatConvBuffer+VOFW;
2793
    }
2794
    else if (isGray(srcFormat))
2795
    {
2796
        return;
2797
    }
2798
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2799
    {
2800
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2801
        src1= formatConvBuffer;
2802
        src2= formatConvBuffer+VOFW;
2803
    }
2804

    
2805
#ifdef HAVE_MMX
2806
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2807
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2808
#else
2809
    if (!(flags&SWS_FAST_BILINEAR))
2810
#endif
2811
    {
2812
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2813
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2814
    }
2815
    else // fast bilinear upscale / crap downscale
2816
    {
2817
#if defined(ARCH_X86)
2818
#ifdef HAVE_MMX2
2819
        int i;
2820
#if defined(PIC)
2821
        uint64_t ebxsave __attribute__((aligned(8)));
2822
#endif
2823
        if (canMMX2BeUsed)
2824
        {
2825
            asm volatile(
2826
#if defined(PIC)
2827
            "mov          %%"REG_b", %6         \n\t"
2828
#endif
2829
            "pxor             %%mm7, %%mm7      \n\t"
2830
            "mov                 %0, %%"REG_c"  \n\t"
2831
            "mov                 %1, %%"REG_D"  \n\t"
2832
            "mov                 %2, %%"REG_d"  \n\t"
2833
            "mov                 %3, %%"REG_b"  \n\t"
2834
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2835
            PREFETCH"   (%%"REG_c")             \n\t"
2836
            PREFETCH" 32(%%"REG_c")             \n\t"
2837
            PREFETCH" 64(%%"REG_c")             \n\t"
2838

    
2839
#ifdef ARCH_X86_64
2840

    
2841
#define FUNNY_UV_CODE \
2842
            "movl       (%%"REG_b"), %%esi      \n\t"\
2843
            "call               *%4             \n\t"\
2844
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2845
            "add          %%"REG_S", %%"REG_c"  \n\t"\
2846
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2847
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2848

    
2849
#else
2850

    
2851
#define FUNNY_UV_CODE \
2852
            "movl       (%%"REG_b"), %%esi      \n\t"\
2853
            "call               *%4             \n\t"\
2854
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2855
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2856
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2857

    
2858
#endif /* ARCH_X86_64 */
2859

    
2860
FUNNY_UV_CODE
2861
FUNNY_UV_CODE
2862
FUNNY_UV_CODE
2863
FUNNY_UV_CODE
2864
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2865
            "mov                 %5, %%"REG_c"  \n\t" // src
2866
            "mov                 %1, %%"REG_D"  \n\t" // buf1
2867
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
2868
            PREFETCH"   (%%"REG_c")             \n\t"
2869
            PREFETCH" 32(%%"REG_c")             \n\t"
2870
            PREFETCH" 64(%%"REG_c")             \n\t"
2871

    
2872
FUNNY_UV_CODE
2873
FUNNY_UV_CODE
2874
FUNNY_UV_CODE
2875
FUNNY_UV_CODE
2876

    
2877
#if defined(PIC)
2878
            "mov %6, %%"REG_b"    \n\t"
2879
#endif
2880
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2881
            "m" (funnyUVCode), "m" (src2)
2882
#if defined(PIC)
2883
            ,"m" (ebxsave)
2884
#endif
2885
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2886
#if !defined(PIC)
2887
             ,"%"REG_b
2888
#endif
2889
            );
2890
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2891
            {
2892
                //printf("%d %d %d\n", dstWidth, i, srcW);
2893
                dst[i] = src1[srcW-1]*128;
2894
                dst[i+VOFW] = src2[srcW-1]*128;
2895
            }
2896
        }
2897
        else
2898
        {
2899
#endif /* HAVE_MMX2 */
2900
            long xInc_shr16 = (long) (xInc >> 16);
2901
            uint16_t xInc_mask = xInc & 0xffff;
2902
            asm volatile(
2903
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
2904
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2905
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
2906
            ASMALIGN(4)
2907
            "1:                                     \n\t"
2908
            "mov        %0, %%"REG_S"               \n\t"
2909
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2910
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2911
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2912
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2913
            "shll      $16, %%edi                   \n\t"
2914
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2915
            "mov        %1, %%"REG_D"               \n\t"
2916
            "shrl       $9, %%esi                   \n\t"
2917
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2918

    
2919
            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2920
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2921
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2922
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2923
            "shll      $16, %%edi                   \n\t"
2924
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2925
            "mov        %1, %%"REG_D"               \n\t"
2926
            "shrl       $9, %%esi                   \n\t"
2927
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
2928

    
2929
            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
2930
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
2931
            "add        $1, %%"REG_a"               \n\t"
2932
            "cmp        %2, %%"REG_a"               \n\t"
2933
            " jb        1b                          \n\t"
2934

    
2935
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2936
   which is needed to support GCC 4.0. */
2937
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2938
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2939
#else
2940
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2941
#endif
2942
            "r" (src2)
2943
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2944
            );
2945
#ifdef HAVE_MMX2
2946
        } //if MMX2 can't be used
2947
#endif
2948
#else
2949
        int i;
2950
        unsigned int xpos=0;
2951
        for (i=0;i<dstWidth;i++)
2952
        {
2953
            register unsigned int xx=xpos>>16;
2954
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
2955
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2956
            dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2957
            /* slower
2958
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2959
            dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2960
            */
2961
            xpos+=xInc;
2962
        }
2963
#endif /* defined(ARCH_X86) */
2964
    }
2965
}
2966

    
2967
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2968
                           int srcSliceH, uint8_t* dst[], int dstStride[]){
2969

    
2970
    /* load a few things into local vars to make the code more readable? and faster */
2971
    const int srcW= c->srcW;
2972
    const int dstW= c->dstW;
2973
    const int dstH= c->dstH;
2974
    const int chrDstW= c->chrDstW;
2975
    const int chrSrcW= c->chrSrcW;
2976
    const int lumXInc= c->lumXInc;
2977
    const int chrXInc= c->chrXInc;
2978
    const int dstFormat= c->dstFormat;
2979
    const int srcFormat= c->srcFormat;
2980
    const int flags= c->flags;
2981
    const int canMMX2BeUsed= c->canMMX2BeUsed;
2982
    int16_t *vLumFilterPos= c->vLumFilterPos;
2983
    int16_t *vChrFilterPos= c->vChrFilterPos;
2984
    int16_t *hLumFilterPos= c->hLumFilterPos;
2985
    int16_t *hChrFilterPos= c->hChrFilterPos;
2986
    int16_t *vLumFilter= c->vLumFilter;
2987
    int16_t *vChrFilter= c->vChrFilter;
2988
    int16_t *hLumFilter= c->hLumFilter;
2989
    int16_t *hChrFilter= c->hChrFilter;
2990
    int32_t *lumMmxFilter= c->lumMmxFilter;
2991
    int32_t *chrMmxFilter= c->chrMmxFilter;
2992
    const int vLumFilterSize= c->vLumFilterSize;
2993
    const int vChrFilterSize= c->vChrFilterSize;
2994
    const int hLumFilterSize= c->hLumFilterSize;
2995
    const int hChrFilterSize= c->hChrFilterSize;
2996
    int16_t **lumPixBuf= c->lumPixBuf;
2997
    int16_t **chrPixBuf= c->chrPixBuf;
2998
    const int vLumBufSize= c->vLumBufSize;
2999
    const int vChrBufSize= c->vChrBufSize;
3000
    uint8_t *funnyYCode= c->funnyYCode;
3001
    uint8_t *funnyUVCode= c->funnyUVCode;
3002
    uint8_t *formatConvBuffer= c->formatConvBuffer;
3003
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
3004
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
3005
    int lastDstY;
3006
    uint8_t *pal=NULL;
3007

    
3008
    /* vars which will change and which we need to store back in the context */
3009
    int dstY= c->dstY;
3010
    int lumBufIndex= c->lumBufIndex;
3011
    int chrBufIndex= c->chrBufIndex;
3012
    int lastInLumBuf= c->lastInLumBuf;
3013
    int lastInChrBuf= c->lastInChrBuf;
3014

    
3015
    if (isPacked(c->srcFormat)){
3016
        pal= src[1];
3017
        src[0]=
3018
        src[1]=
3019
        src[2]= src[0];
3020
        srcStride[0]=
3021
        srcStride[1]=
3022
        srcStride[2]= srcStride[0];
3023
    }
3024
    srcStride[1]<<= c->vChrDrop;
3025
    srcStride[2]<<= c->vChrDrop;
3026

    
3027
    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
3028
    //       (int)dst[0], (int)dst[1], (int)dst[2]);
3029

    
3030
#if 0 //self test FIXME move to a vfilter or something
3031
    {
3032
    static volatile int i=0;
3033
    i++;
3034
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
3035
        selfTest(src, srcStride, c->srcW, c->srcH);
3036
    i--;
3037
    }
3038
#endif
3039

    
3040
    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
3041
    //dstStride[0],dstStride[1],dstStride[2]);
3042

    
3043
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3044
    {
3045
        static int firstTime=1; //FIXME move this into the context perhaps
3046
        if (flags & SWS_PRINT_INFO && firstTime)
3047
        {
3048
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
3049
                   "         ->cannot do aligned memory accesses anymore\n");
3050
            firstTime=0;
3051
        }
3052
    }
3053

    
3054
    /* Note the user might start scaling the picture in the middle so this
3055
       will not get executed. This is not really intended but works
3056
       currently, so people might do it. */
3057
    if (srcSliceY ==0){
3058
        lumBufIndex=0;
3059
        chrBufIndex=0;
3060
        dstY=0;
3061
        lastInLumBuf= -1;
3062
        lastInChrBuf= -1;
3063
    }
3064

    
3065
    lastDstY= dstY;
3066

    
3067
    for (;dstY < dstH; dstY++){
3068
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
3069
        const int chrDstY= dstY>>c->chrDstVSubSample;
3070
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3071
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3072

    
3073
        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3074
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3075
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3076
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3077

    
3078
        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3079
        // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
3080
        //handle holes (FAST_BILINEAR & weird filters)
3081
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3082
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3083
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3084
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
3085
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
3086

    
3087
        // Do we have enough lines in this slice to output the dstY line
3088
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3089
        {
3090
            //Do horizontal scaling
3091
            while(lastInLumBuf < lastLumSrcY)
3092
            {
3093
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3094
                lumBufIndex++;
3095
                //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
3096
                assert(lumBufIndex < 2*vLumBufSize);
3097
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3098
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
3099
                //printf("%d %d\n", lumBufIndex, vLumBufSize);
3100
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3101
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3102
                                funnyYCode, c->srcFormat, formatConvBuffer,
3103
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3104
                lastInLumBuf++;
3105
            }
3106
            while(lastInChrBuf < lastChrSrcY)
3107
            {
3108
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3109
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3110
                chrBufIndex++;
3111
                assert(chrBufIndex < 2*vChrBufSize);
3112
                assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3113
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3114
                //FIXME replace parameters through context struct (some at least)
3115

    
3116
                if (!(isGray(srcFormat) || isGray(dstFormat)))
3117
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3118
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3119
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
3120
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3121
                lastInChrBuf++;
3122
            }
3123
            //wrap buf index around to stay inside the ring buffer
3124
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3125
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3126
        }
3127
        else // not enough lines left in this slice -> load the rest in the buffer
3128
        {
3129
            /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3130
            firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3131
            lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3132
            vChrBufSize, vLumBufSize);*/
3133

    
3134
            //Do horizontal scaling
3135
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3136
            {
3137
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3138
                lumBufIndex++;
3139
                assert(lumBufIndex < 2*vLumBufSize);
3140
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3141
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
3142
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3143
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3144
                                funnyYCode, c->srcFormat, formatConvBuffer,
3145
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3146
                lastInLumBuf++;
3147
            }
3148
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3149
            {
3150
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3151
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3152
                chrBufIndex++;
3153
                assert(chrBufIndex < 2*vChrBufSize);
3154
                assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3155
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3156

    
3157
                if (!(isGray(srcFormat) || isGray(dstFormat)))
3158
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3159
                            flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3160
                            funnyUVCode, c->srcFormat, formatConvBuffer,
3161
                            c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3162
                lastInChrBuf++;
3163
            }
3164
            //wrap buf index around to stay inside the ring buffer
3165
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3166
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3167
            break; //we can't output a dstY line so let's try with the next slice
3168
        }
3169

    
3170
#ifdef HAVE_MMX
3171
        b5Dither= ff_dither8[dstY&1];
3172
        g6Dither= ff_dither4[dstY&1];
3173
        g5Dither= ff_dither8[dstY&1];
3174
        r5Dither= ff_dither8[(dstY+1)&1];
3175
#endif
3176
        if (dstY < dstH-2)
3177
        {
3178
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3179
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3180
#ifdef HAVE_MMX
3181
            int i;
3182
        if (flags & SWS_ACCURATE_RND){
3183
            for (i=0; i<vLumFilterSize; i+=2){
3184
                lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i  ];
3185
                lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3186
                lumMmxFilter[2*i+2]=
3187
                lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i    ]
3188
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3189
            }
3190
            for (i=0; i<vChrFilterSize; i+=2){
3191
                chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i  ];
3192
                chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3193
                chrMmxFilter[2*i+2]=
3194
                chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i    ]
3195
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3196
            }
3197
        }else{
3198
            for (i=0; i<vLumFilterSize; i++)
3199
            {
3200
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3201
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3202
                lumMmxFilter[4*i+2]=
3203
                lumMmxFilter[4*i+3]=
3204
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3205
            }
3206
            for (i=0; i<vChrFilterSize; i++)
3207
            {
3208
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3209
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3210
                chrMmxFilter[4*i+2]=
3211
                chrMmxFilter[4*i+3]=
3212
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3213
            }
3214
        }
3215
#endif
3216
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3217
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3218
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3219
                RENAME(yuv2nv12X)(c,
3220
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3221
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3222
                    dest, uDest, dstW, chrDstW, dstFormat);
3223
            }
3224
            else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3225
            {
3226
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3227
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3228
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3229
                {
3230
                    int16_t *lumBuf = lumPixBuf[0];
3231
                    int16_t *chrBuf= chrPixBuf[0];
3232
                    RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3233
                }
3234
                else //General YV12
3235
                {
3236
                    RENAME(yuv2yuvX)(c,
3237
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3238
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3239
                        dest, uDest, vDest, dstW, chrDstW);
3240
                }
3241
            }
3242
            else
3243
            {
3244
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3245
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3246
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3247
                {
3248
                    int chrAlpha= vChrFilter[2*dstY+1];
3249
                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3250
                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
3251
                }
3252
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3253
                {
3254
                    int lumAlpha= vLumFilter[2*dstY+1];
3255
                    int chrAlpha= vChrFilter[2*dstY+1];
3256
                    lumMmxFilter[2]=
3257
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
3258
                    chrMmxFilter[2]=
3259
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3260
                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
</