Statistics
| Branch: | Revision:

ffmpeg / libswscale / swscale_template.c @ bf2bdde6

History | View | Annotate | Download (134 KB)

/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
/* Reset the per-configuration SIMD helper macros so this template can be
 * #included more than once with different HAVE_MMX2/HAVE_3DNOW settings. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
/* EMMS: instruction string used to leave MMX state after the asm blocks. */
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif
/* Prefetch instructions for the target CPU; on CPUs with neither 3DNow nor
 * MMX2 the macros expand to an asm comment, i.e. a no-op. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined (HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
/* SFENCE: store fence needed after the non-temporal (movntq) stores;
 * only exists with MMX2, otherwise a no-op comment. */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif
/* PAVGB: packed unsigned byte average.  NOTE(review): deliberately left
 * undefined when neither HAVE_MMX2 nor HAVE_3DNOW is set, so any use of it
 * in a plain-MMX build fails at compile time rather than miscompiling. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
/* MOVNTQ: non-temporal store on MMX2, plain movq elsewhere.  The two-level
 * macro makes MOVNTQ expand its arguments before REAL_MOVNTQ stringifies
 * them with the # operator. */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
/* Pull in the AltiVec (PowerPC) implementations of the scaler loops. */
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
/*
 * Vertical scale to planar 8-bit output (YV12 path).
 * Walks the {source pointer, filter coefficient} list found at
 * " offset "(%0): loads two quads of 16-bit samples per tap,
 * multiply-accumulates them with pmulhw into mm3/mm4 (pre-loaded with the
 * rounder from VROUNDER_OFFSET), and on the NULL pointer that terminates
 * the list (test/jnz on REG_S) shifts down by 3 and packs to unsigned
 * bytes, stored with MOVNTQ.
 * Operands: %0 = &c->redDither (base of the context offset tables),
 * %1 = dest, %2 = width; "x" is a textual byte offset into the source rows.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    asm volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t"\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
/*
 * Higher-precision variant of YSCALEYUV2YV12X: interleaves pairs of taps
 * with punpcklwd/punpckhwd and accumulates 32-bit products via pmaddwd
 * (two taps consumed per iteration, hence the 4()/16()/+16 pointer walk),
 * then shifts back down, adds the rounder, narrows with packssdw and packs
 * to unsigned bytes.  Same operand conventions as YSCALEYUV2YV12X;
 * the tap list is again NULL-terminated (test/jnz on REG_S).
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    asm volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov                       4(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq                      8(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov                      16(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                                $16, %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t"\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
/*
 * 1-tap "copy" path: shifts the 16-bit intermediate samples down by 7
 * (no rounding) and packs them to unsigned bytes.
 * %0 = src, %1 = dest, %2 = loop start index.
 * NOTE(review): the add/jnc idiom implies %2 is a negative index and the
 * loop exits when REG_a crosses zero (src/dest presumably pre-biased by
 * the caller) -- confirm against the call sites, which are outside this
 * chunk.
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
/*
 * Rounding variant of YSCALEYUV2YV121: builds the constant 64 (1 << 6)
 * in every word of mm7 (pcmpeqw -> psrlw $15 -> psllw $6) and adds it
 * before the >>7, i.e. rounds to nearest instead of truncating.
 * Same operand conventions as YSCALEYUV2YV121.
 */
#define YSCALEYUV2YV121_ACCURATE \
    "mov %2, %%"REG_a"                    \n\t"\
    "pcmpeqw %%mm7, %%mm7                 \n\t"\
    "psrlw                 $15, %%mm7     \n\t"\
    "psllw                  $6, %%mm7     \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "paddw              %%mm7, %%mm0      \n\t"\
    "paddw              %%mm7, %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/*
 * Opens the asm statement for packed-pixel output and runs the vertical
 * filter: the first inner loop (label 2) accumulates the chroma taps
 * (U into mm3, V into mm4, V fetched at byte offset VOF), the second the
 * luma taps (Y1 into mm1, Y2 into mm7).  Both tap lists live in the
 * context at CHR_/LUM_MMX_FILTER_OFFSET and are NULL-pointer terminated
 * (test/jnz on REG_S).  Results are left in registers for a following
 * conversion macro (e.g. YSCALEYUV2RGBX); the statement is closed by
 * YSCALEYUV2PACKEDX_END.  %0 = &c->redDither.
 */
#define YSCALEYUV2PACKEDX \
    asm volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
    "movq                      %%mm1, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm1         \n\t"\
    "paddw                     %%mm5, %%mm7         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\

/* Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE):
 * operand and clobber lists.  NOTE(review): the three dummy "m" operands
 * appear to pad the numbering so dest/dstW land at %4/%5 for the write
 * macros emitted in between -- those macros are outside this chunk,
 * confirm against them. */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
/*
 * High-precision version of YSCALEYUV2PACKEDX: like the _ACCURATE planar
 * macro it interleaves tap pairs and accumulates 32-bit pmaddwd products.
 * The finished chroma sums are spilled to U_TEMP/V_TEMP in the context
 * while the luma loop runs (all eight mm registers are needed), then
 * reloaded into mm3/mm4 at the end so register state matches what
 * YSCALEYUV2RGBX expects (Y1=mm1, Y2=mm7, U=mm3, V=mm4).
 * Closed by YSCALEYUV2PACKEDX_END; %0 = &c->redDither.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq               8(%%"REG_d"), %%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
    "add                         $16, %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq               8(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
    "add                         $16, %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\

#define YSCALEYUV2RGBX \
353
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
354
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
355
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
356
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
357
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
358
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
359
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
360
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
361
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
362
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
363
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
364
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
365
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
366
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
367
    "paddw           %%mm3, %%mm4       \n\t"\
368
    "movq            %%mm2, %%mm0       \n\t"\
369
    "movq            %%mm5, %%mm6       \n\t"\
370
    "movq            %%mm4, %%mm3       \n\t"\
371
    "punpcklwd       %%mm2, %%mm2       \n\t"\
372
    "punpcklwd       %%mm5, %%mm5       \n\t"\
373
    "punpcklwd       %%mm4, %%mm4       \n\t"\
374
    "paddw           %%mm1, %%mm2       \n\t"\
375
    "paddw           %%mm1, %%mm5       \n\t"\
376
    "paddw           %%mm1, %%mm4       \n\t"\
377
    "punpckhwd       %%mm0, %%mm0       \n\t"\
378
    "punpckhwd       %%mm6, %%mm6       \n\t"\
379
    "punpckhwd       %%mm3, %%mm3       \n\t"\
380
    "paddw           %%mm7, %%mm0       \n\t"\
381
    "paddw           %%mm7, %%mm6       \n\t"\
382
    "paddw           %%mm7, %%mm3       \n\t"\
383
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
384
    "packuswb        %%mm0, %%mm2       \n\t"\
385
    "packuswb        %%mm6, %%mm5       \n\t"\
386
    "packuswb        %%mm3, %%mm4       \n\t"\
387
    "pxor            %%mm7, %%mm7       \n\t"
388
/* Dead code: old "full" bilinear YUV->RGB kernel using the MANGLE()d
 * global coefficient tables.  Disabled with #if 0; kept for reference. */
#if 0
#define FULL_YSCALEYUV2RGB \
    "pxor                 %%mm7, %%mm7  \n\t"\
    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "xor              %%"REG_a", %%"REG_a"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
\
\
    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
    "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
\
\
    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
    "packuswb             %%mm3, %%mm3  \n\t"\
\
    "packuswb             %%mm0, %%mm0  \n\t"\
    "paddw                %%mm4, %%mm2  \n\t"\
    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
\
    "packuswb             %%mm1, %%mm1  \n\t"
#endif
/*
 * Two-line vertical bilinear blend for packed (non-RGB) output:
 * interpolates uvbuf0/uvbuf1 into mm3 (U) / mm4 (V) and buf0/buf1 into
 * mm1 (Y1) / mm7 (Y2).  The prologue shifts the stored filter
 * coefficients right by 3 and writes them BACK into the context
 * (CHR_/LUM_MMX_FILTER_OFFSET+8), so the context coefficients are
 * modified by running this macro.
 * The REAL_/wrapper pair makes "index"/"c" expand before # stringifies
 * them.  %0/%1 = buf0/buf1, %2/%3 = uvbuf0/uvbuf1.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
/*
 * Two-line vertical bilinear blend followed by YUV -> RGB conversion,
 * using the per-context coefficient tables at ("#c") instead of the
 * global MANGLE()d ones.  %0/%1 = buf0/buf1 (luma), %2/%3 =
 * uvbuf0/uvbuf1 (chroma, V at byte offset VOF).  Ends with packed bytes
 * B=mm2, R=mm5, G=mm4 (see the mm0=B1 ... mm6=R2 comment) and mm7
 * zeroed, ready for the unpack/write macros that follow the 1: loop.
 * The REAL_/wrapper pair makes "index"/"c" expand before stringification.
 */
#define REAL_YSCALEYUV2RGB(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
/*
 * Single-source-line (no vertical interpolation) variant of
 * YSCALEYUV2PACKED: reads only buf0/uvbuf0 and shifts the 16-bit
 * intermediates down by 7.  Leaves U=mm3, V=mm4, Y1=mm1, Y2=mm7.
 * %0 = buf0, %2 = uvbuf0 (V at byte offset VOF).
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
    
562
// Single-source-line YUV->RGB kernel: shifts down by 4 (keeping 4 fraction
// bits for the pmulhw colorspace multiplies), subtracts the U/V/Y offsets and
// applies the coefficient table stored in the context "c".  Leaves packed
// bytes mm2=B, mm4=G, mm5=R, mm7=0, as expected by the WRITE* macros below.
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)

// Like YSCALEYUV2PACKED1 but blends the chroma of two source lines:
// paddw then >>8 yields the average of uvbuf0/uvbuf1 at 8-bit scale;
// luma still comes from a single line (>>7).
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)

// do vertical chrominance interpolation
// Like YSCALEYUV2RGB1 but averages the chroma of two source lines
// (paddw + psrlw $5 == average at the same >>4 scale); luma is still taken
// from one line.  Output registers match YSCALEYUV2RGB1: mm2=B, mm4=G,
// mm5=R, mm7=0.
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)

// Interleave mm2=B, mm4=G, mm5=R (mm7 must be 0) into 0RGB dwords and store
// 8 pixels (32 bytes) per iteration; loops to label "1:" until index==dstw.
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0,   (dst, index, 4))\
    MOVNTQ(%%mm2,  8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)

// Pack mm2=B, mm4=G, mm5=R bytes into 16bpp 5-6-5 pixels (masks bF8/bFC keep
// the top 5/6 bits of each channel) and store 8 pixels per iteration.
// mm7 must be 0 on entry.
#define REAL_WRITEBGR16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)

// Pack mm2=B, mm4=G, mm5=R bytes into 16bpp 5-5-5 pixels (bF8 keeps the top
// 5 bits of every channel) and store 8 pixels per iteration.
// mm7 must be 0 on entry.
#define REAL_WRITEBGR15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)

// Older 24bpp writer: builds the 3-bytes-per-pixel stream with explicit
// mask/shift/or steps.  Unused in favor of WRITEBGR24MMX/WRITEBGR24MMX2
// below (WRITEBGR24 never maps here); kept for reference.
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"

// Plain-MMX 24bpp writer: interleaves mm2=B, mm4=G, mm5=R into 0RGB dwords,
// then shifts/ors neighbouring pixels into a continuous 24bpp stream
// (8 pixels -> 24 bytes per iteration).  mm7 must be 0 on entry; note dst
// itself is advanced by 24 each pass while index advances by 8.
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"

// MMX2 24bpp writer: uses pshufw plus the ff_M24A/ff_M24B/ff_M24C byte masks
// to scatter B/G/R bytes directly into the 24bpp layout, three qwords per
// 8-pixel iteration.  Clobbers mm7 (used as a mask, not as zero).
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"

// Select the fastest available 24bpp writer: the pshufw-based variant when
// MMX2 is available, the plain MMX variant otherwise.
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

// Pack the luma (mm1/mm7) and chroma (mm3=U, mm4=V) words into interleaved
// YUYV422 and store 8 pixels (16 bytes) per iteration.
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)

/**
 * Vertical multi-tap filtering of the intermediate luma/chroma buffers into
 * 8-bit YV12 planes.  Uses the MMX kernels when built with HAVE_MMX (the
 * accurate-rounding variant when SWS_ACCURATE_RND is set), else AltiVec,
 * else the plain C fallback.  uDest may be NULL to skip chroma output.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (c->flags & SWS_ACCURATE_RND){
        if (uDest){
            YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }else{
        if (uDest){
            YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif /* HAVE_MMX */
}

/**
 * Vertical filtering into a semi-planar (NV12-style) destination.
 * No SIMD implementation exists for this path, so it simply forwards to the
 * generic C implementation yuv2nv12XinC().
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
             chrFilter, chrSrc, chrFilterSize,
             dest, uDest, dstW, chrDstW, dstFormat);
}

/**
 * 1:1 vertical copy of the 15-bit intermediate buffers to 8-bit planes
 * (>>7 with clipping to 0..255).  uDest may be NULL to skip chroma.
 */
static inline void RENAME(yuv2yuv1)(SwsContext *c, int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    long p= uDest ? 3 : 1;  /* luma only when no chroma destination */
    /* NOTE(review): initializers are int16_t* implicitly converted to
     * uint8_t*; the offsets are element counts consumed by the asm kernels
     * below -- confirm before changing any of these types. */
    uint8_t *src[3]= {lumSrc + dstW, chrSrc + chrDstW, chrSrc + VOFW + chrDstW};
    uint8_t *dst[3]= {dest, uDest, vDest};
    long counter[3] = {dstW, chrDstW, chrDstW};

    if (c->flags & SWS_ACCURATE_RND){
        while(p--){
            asm volatile(
                YSCALEYUV2YV121_ACCURATE
                :: "r" (src[p]), "r" (dst[p] + counter[p]),
                "g" (-counter[p])
                : "%"REG_a
            );
        }
    }else{
    while(p--){
        asm volatile(
            YSCALEYUV2YV121
            :: "r" (src[p]), "r" (dst[p] + counter[p]),
            "g" (-counter[p])
            : "%"REG_a
        );
    }
    }

#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int val= lumSrc[i]>>7;

        /* val&256 set => outside 0..255, clamp to the nearer bound */
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=chrSrc[i]>>7;
            int v=chrSrc[i + VOFW]>>7;

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
#endif
}

/**
1054
 * vertical scale YV12 to RGB
1055
 */
1056
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1057
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1058
                                       uint8_t *dest, long dstW, long dstY)
1059
{
1060
#ifdef HAVE_MMX
1061
    long dummy=0;
1062
    if (c->flags & SWS_ACCURATE_RND){
1063
        switch(c->dstFormat){
1064
        case PIX_FMT_RGB32:
1065
            YSCALEYUV2PACKEDX_ACCURATE
1066
            YSCALEYUV2RGBX
1067
            WRITEBGR32(%4, %5, %%REGa)
1068

    
1069
            YSCALEYUV2PACKEDX_END
1070
            return;
1071
        case PIX_FMT_BGR24:
1072
            YSCALEYUV2PACKEDX_ACCURATE
1073
            YSCALEYUV2RGBX
1074
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1075
            "add %4, %%"REG_c"                        \n\t"
1076
            WRITEBGR24(%%REGc, %5, %%REGa)
1077

    
1078

    
1079
            :: "r" (&c->redDither),
1080
               "m" (dummy), "m" (dummy), "m" (dummy),
1081
               "r" (dest), "m" (dstW)
1082
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1083
            );
1084
            return;
1085
        case PIX_FMT_BGR555:
1086
            YSCALEYUV2PACKEDX_ACCURATE
1087
            YSCALEYUV2RGBX
1088
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1089
#ifdef DITHER1XBPP
1090
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1091
            "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1092
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1093
#endif
1094

    
1095
            WRITEBGR15(%4, %5, %%REGa)
1096
            YSCALEYUV2PACKEDX_END
1097
            return;
1098
        case PIX_FMT_BGR565:
1099
            YSCALEYUV2PACKEDX_ACCURATE
1100
            YSCALEYUV2RGBX
1101
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1102
#ifdef DITHER1XBPP
1103
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1104
            "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1105
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1106
#endif
1107

    
1108
            WRITEBGR16(%4, %5, %%REGa)
1109
            YSCALEYUV2PACKEDX_END
1110
            return;
1111
        case PIX_FMT_YUYV422:
1112
            YSCALEYUV2PACKEDX_ACCURATE
1113
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1114

    
1115
            "psraw $3, %%mm3    \n\t"
1116
            "psraw $3, %%mm4    \n\t"
1117
            "psraw $3, %%mm1    \n\t"
1118
            "psraw $3, %%mm7    \n\t"
1119
            WRITEYUY2(%4, %5, %%REGa)
1120
            YSCALEYUV2PACKEDX_END
1121
            return;
1122
    }
1123
    }else{
1124
        switch(c->dstFormat)
1125
        {
1126
        case PIX_FMT_RGB32:
1127
            YSCALEYUV2PACKEDX
1128
            YSCALEYUV2RGBX
1129
            WRITEBGR32(%4, %5, %%REGa)
1130
            YSCALEYUV2PACKEDX_END
1131
            return;
1132
        case PIX_FMT_BGR24:
1133
            YSCALEYUV2PACKEDX
1134
            YSCALEYUV2RGBX
1135
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1136
            "add                        %4, %%"REG_c"   \n\t"
1137
            WRITEBGR24(%%REGc, %5, %%REGa)
1138

    
1139
            :: "r" (&c->redDither),
1140
               "m" (dummy), "m" (dummy), "m" (dummy),
1141
               "r" (dest),  "m" (dstW)
1142
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1143
            );
1144
            return;
1145
        case PIX_FMT_BGR555:
1146
            YSCALEYUV2PACKEDX
1147
            YSCALEYUV2RGBX
1148
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1149
#ifdef DITHER1XBPP
1150
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
1151
            "paddusb "MANGLE(g5Dither)", %%mm4  \n\t"
1152
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
1153
#endif
1154

    
1155
            WRITEBGR15(%4, %5, %%REGa)
1156
            YSCALEYUV2PACKEDX_END
1157
            return;
1158
        case PIX_FMT_BGR565:
1159
            YSCALEYUV2PACKEDX
1160
            YSCALEYUV2RGBX
1161
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1162
#ifdef DITHER1XBPP
1163
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
1164
            "paddusb "MANGLE(g6Dither)", %%mm4  \n\t"
1165
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
1166
#endif
1167

    
1168
            WRITEBGR16(%4, %5, %%REGa)
1169
            YSCALEYUV2PACKEDX_END
1170
            return;
1171
        case PIX_FMT_YUYV422:
1172
            YSCALEYUV2PACKEDX
1173
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1174

    
1175
            "psraw $3, %%mm3    \n\t"
1176
            "psraw $3, %%mm4    \n\t"
1177
            "psraw $3, %%mm1    \n\t"
1178
            "psraw $3, %%mm7    \n\t"
1179
            WRITEYUY2(%4, %5, %%REGa)
1180
            YSCALEYUV2PACKEDX_END
1181
            return;
1182
        }
1183
    }
1184
#endif /* HAVE_MMX */
1185
#ifdef HAVE_ALTIVEC
1186
    /* The following list of supported dstFormat values should
1187
       match what's found in the body of altivec_yuv2packedX() */
1188
    if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1189
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1190
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
1191
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1192
                                 chrFilter, chrSrc, chrFilterSize,
1193
                                 dest, dstW, dstY);
1194
    else
1195
#endif
1196
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1197
                       chrFilter, chrSrc, chrFilterSize,
1198
                       dest, dstW, dstY);
1199
}
1200

    
1201
/**
1202
 * vertical bilinear scale YV12 to RGB
1203
 */
1204
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1205
                          uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1206
{
1207
    int yalpha1=yalpha^4095;
1208
    int uvalpha1=uvalpha^4095;
1209
    int i;
1210

    
1211
#if 0 //isn't used
1212
    if (flags&SWS_FULL_CHR_H_INT)
1213
    {
1214
        switch(dstFormat)
1215
        {
1216
#ifdef HAVE_MMX
1217
        case PIX_FMT_RGB32:
1218
            asm volatile(
1219

1220

1221
FULL_YSCALEYUV2RGB
1222
            "punpcklbw %%mm1, %%mm3    \n\t" // BGBGBGBG
1223
            "punpcklbw %%mm7, %%mm0    \n\t" // R0R0R0R0
1224

1225
            "movq      %%mm3, %%mm1    \n\t"
1226
            "punpcklwd %%mm0, %%mm3    \n\t" // BGR0BGR0
1227
            "punpckhwd %%mm0, %%mm1    \n\t" // BGR0BGR0
1228

1229
            MOVNTQ(%%mm3,  (%4, %%REGa, 4))
1230
            MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1231

1232
            "add $4, %%"REG_a"  \n\t"
1233
            "cmp %5, %%"REG_a"  \n\t"
1234
            " jb 1b             \n\t"
1235

1236
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1237
            "m" (yalpha1), "m" (uvalpha1)
1238
            : "%"REG_a
1239
            );
1240
            break;
1241
        case PIX_FMT_BGR24:
1242
            asm volatile(
1243

1244
FULL_YSCALEYUV2RGB
1245

1246
                                              // lsb ... msb
1247
            "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
1248
            "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0
1249

1250
            "movq      %%mm3, %%mm1     \n\t"
1251
            "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
1252
            "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0
1253

1254
            "movq      %%mm3, %%mm2     \n\t" // BGR0BGR0
1255
            "psrlq        $8, %%mm3     \n\t" // GR0BGR00
1256
            "pand "MANGLE(bm00000111)", %%mm2   \n\t" // BGR00000
1257
            "pand "MANGLE(bm11111000)", %%mm3   \n\t" // 000BGR00
1258
            "por       %%mm2, %%mm3     \n\t" // BGRBGR00
1259
            "movq      %%mm1, %%mm2     \n\t"
1260
            "psllq       $48, %%mm1     \n\t" // 000000BG
1261
            "por       %%mm1, %%mm3     \n\t" // BGRBGRBG
1262

1263
            "movq      %%mm2, %%mm1     \n\t" // BGR0BGR0
1264
            "psrld       $16, %%mm2     \n\t" // R000R000
1265
            "psrlq       $24, %%mm1     \n\t" // 0BGR0000
1266
            "por       %%mm2, %%mm1     \n\t" // RBGRR000
1267

1268
            "mov          %4, %%"REG_b" \n\t"
1269
            "add   %%"REG_a", %%"REG_b" \n\t"
1270

1271
#ifdef HAVE_MMX2
1272
            //FIXME Alignment
1273
            "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2)   \n\t"
1274
            "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)   \n\t"
1275
#else
1276
            "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)     \n\t"
1277
            "psrlq  $32, %%mm3                          \n\t"
1278
            "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)     \n\t"
1279
            "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)     \n\t"
1280
#endif
1281
            "add     $4, %%"REG_a"                      \n\t"
1282
            "cmp     %5, %%"REG_a"                      \n\t"
1283
            " jb     1b                                 \n\t"
1284

    
1285
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1286
            "m" (yalpha1), "m" (uvalpha1)
1287
            : "%"REG_a, "%"REG_b
1288
            );
1289
            break;
1290
        case PIX_FMT_BGR555:
1291
            asm volatile(
1292

    
1293
FULL_YSCALEYUV2RGB
1294
#ifdef DITHER1XBPP
1295
            "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1296
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1297
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1298
#endif
1299
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1300
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1301
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1302

    
1303
            "psrlw                   $3, %%mm3  \n\t"
1304
            "psllw                   $2, %%mm1  \n\t"
1305
            "psllw                   $7, %%mm0  \n\t"
1306
            "pand     "MANGLE(g15Mask)", %%mm1  \n\t"
1307
            "pand     "MANGLE(r15Mask)", %%mm0  \n\t"
1308

    
1309
            "por                  %%mm3, %%mm1  \n\t"
1310
            "por                  %%mm1, %%mm0  \n\t"
1311

    
1312
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1313

    
1314
            "add $4, %%"REG_a"  \n\t"
1315
            "cmp %5, %%"REG_a"  \n\t"
1316
            " jb 1b             \n\t"
1317

    
1318
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1319
            "m" (yalpha1), "m" (uvalpha1)
1320
            : "%"REG_a
1321
            );
1322
            break;
1323
        case PIX_FMT_BGR565:
1324
            asm volatile(
1325

    
1326
FULL_YSCALEYUV2RGB
1327
#ifdef DITHER1XBPP
1328
            "paddusb "MANGLE(g6Dither)", %%mm1  \n\t"
1329
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1330
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1331
#endif
1332
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1333
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1334
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1335

    
1336
            "psrlw                   $3, %%mm3  \n\t"
1337
            "psllw                   $3, %%mm1  \n\t"
1338
            "psllw                   $8, %%mm0  \n\t"
1339
            "pand     "MANGLE(g16Mask)", %%mm1  \n\t"
1340
            "pand     "MANGLE(r16Mask)", %%mm0  \n\t"
1341

    
1342
            "por                  %%mm3, %%mm1  \n\t"
1343
            "por                  %%mm1, %%mm0  \n\t"
1344

    
1345
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1346

    
1347
            "add $4, %%"REG_a"  \n\t"
1348
            "cmp %5, %%"REG_a"  \n\t"
1349
            " jb 1b             \n\t"
1350

    
1351
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1352
            "m" (yalpha1), "m" (uvalpha1)
1353
            : "%"REG_a
1354
            );
1355
            break;
1356
#endif /* HAVE_MMX */
1357
        case PIX_FMT_BGR32:
1358
#ifndef HAVE_MMX
1359
        case PIX_FMT_RGB32:
1360
#endif
1361
            if (dstFormat==PIX_FMT_RGB32)
1362
            {
1363
                int i;
1364
#ifdef WORDS_BIGENDIAN
1365
                dest++;
1366
#endif
1367
                for (i=0;i<dstW;i++){
1368
                    // vertical linear interpolation && yuv2rgb in a single step:
1369
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1370
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1371
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1372
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1373
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1374
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1375
                    dest+= 4;
1376
                }
1377
            }
1378
            else if (dstFormat==PIX_FMT_BGR24)
1379
            {
1380
                int i;
1381
                for (i=0;i<dstW;i++){
1382
                    // vertical linear interpolation && yuv2rgb in a single step:
1383
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1384
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1385
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1386
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1387
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1388
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1389
                    dest+= 3;
1390
                }
1391
            }
1392
            else if (dstFormat==PIX_FMT_BGR565)
1393
            {
1394
                int i;
1395
                for (i=0;i<dstW;i++){
1396
                    // vertical linear interpolation && yuv2rgb in a single step:
1397
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1398
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1399
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1400

    
1401
                    ((uint16_t*)dest)[i] =
1402
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1403
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1404
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1405
                }
1406
            }
1407
            else if (dstFormat==PIX_FMT_BGR555)
1408
            {
1409
                int i;
1410
                for (i=0;i<dstW;i++){
1411
                    // vertical linear interpolation && yuv2rgb in a single step:
1412
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1413
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1414
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1415

    
1416
                    ((uint16_t*)dest)[i] =
1417
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1418
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1419
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1420
                }
1421
            }
1422
        }//FULL_UV_IPOL
1423
    else
1424
    {
1425
#endif // if 0
1426
#ifdef HAVE_MMX
1427
        switch(c->dstFormat)
1428
        {
1429
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1430
            case PIX_FMT_RGB32:
1431
                asm volatile(
1432
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1433
                "mov        %4, %%"REG_b"               \n\t"
1434
                "push %%"REG_BP"                        \n\t"
1435
                YSCALEYUV2RGB(%%REGBP, %5)
1436
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1437
                "pop %%"REG_BP"                         \n\t"
1438
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1439

    
1440
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1441
                "a" (&c->redDither)
1442
                );
1443
                return;
1444
            case PIX_FMT_BGR24:
1445
                asm volatile(
1446
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1447
                "mov        %4, %%"REG_b"               \n\t"
1448
                "push %%"REG_BP"                        \n\t"
1449
                YSCALEYUV2RGB(%%REGBP, %5)
1450
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1451
                "pop %%"REG_BP"                         \n\t"
1452
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1453
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1454
                "a" (&c->redDither)
1455
                );
1456
                return;
1457
            case PIX_FMT_BGR555:
1458
                asm volatile(
1459
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1460
                "mov        %4, %%"REG_b"               \n\t"
1461
                "push %%"REG_BP"                        \n\t"
1462
                YSCALEYUV2RGB(%%REGBP, %5)
1463
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1464
#ifdef DITHER1XBPP
1465
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1466
                "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1467
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1468
#endif
1469

    
1470
                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1471
                "pop %%"REG_BP"                         \n\t"
1472
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1473

    
1474
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475
                "a" (&c->redDither)
1476
                );
1477
                return;
1478
            case PIX_FMT_BGR565:
1479
                asm volatile(
1480
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1481
                "mov        %4, %%"REG_b"               \n\t"
1482
                "push %%"REG_BP"                        \n\t"
1483
                YSCALEYUV2RGB(%%REGBP, %5)
1484
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1485
#ifdef DITHER1XBPP
1486
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1487
                "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1488
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1489
#endif
1490

    
1491
                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1492
                "pop %%"REG_BP"                         \n\t"
1493
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1494
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1495
                "a" (&c->redDither)
1496
                );
1497
                return;
1498
            case PIX_FMT_YUYV422:
1499
                asm volatile(
1500
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1501
                "mov %4, %%"REG_b"                        \n\t"
1502
                "push %%"REG_BP"                        \n\t"
1503
                YSCALEYUV2PACKED(%%REGBP, %5)
1504
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1505
                "pop %%"REG_BP"                         \n\t"
1506
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1507
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1508
                "a" (&c->redDither)
1509
                );
1510
                return;
1511
            default: break;
1512
        }
1513
#endif //HAVE_MMX
1514
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1515
}
1516

    
1517
/**
 * YV12 to RGB without scaling or interpolating.
 * Single-line variant: only buf0 carries luma (buf1 is aliased to it and
 * yalpha1 is 0), so no vertical blending of luma takes place. Chroma may
 * still be blended between uvbuf0/uvbuf1 depending on uvalpha.
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                          uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;

    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT)
    {
        // Full horizontal chroma interpolation requested: fall back to the
        // two-buffer routine with a zero luma blend weight.
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
        return;
    }

#ifdef HAVE_MMX
    // Each asm case below saves REG_b on the context (ESP_OFFSET slot) and
    // pushes REG_BP so both can serve as extra pointer registers; %5 is
    // &c->redDither, through which the YSCALE*/WRITE* macros reach context
    // fields (8280 == DSTW_OFFSET, see note in yuv2packed2).
    if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
    {
        // Chroma taken from uvbuf0 only (no blend): YSCALEYUV2RGB1 variants.
        switch(dstFormat)
        {
        case PIX_FMT_RGB32:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR555:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif
            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR565:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif

            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        }
    }
    else
    {
        // uvalpha >= 2048: average uvbuf0/uvbuf1 chroma ("1b" macro variants).
        switch(dstFormat)
        {
        case PIX_FMT_RGB32:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR555:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif
            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR565:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif

            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        }
    }
#endif /* HAVE_MMX */
    // Portable C fallback (also handles formats the asm does not cover).
    if (uvalpha < 2048)
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
    }
}
1722

    
1723
//FIXME yuy2* can read up to 7 samples too much

// Extract the luma (Y) plane from packed YUY2 (Y U Y V byte order) input.
// 'width' is the number of Y samples to produce (input is width*2 bytes).
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    // Negative index counting up to zero; src/dst are pre-advanced to their
    // ends so (base + index) addressing lands on the current position.
    asm volatile(
    "movq "MANGLE(bm01010101)", %%mm2           \n\t" // byte mask selecting the even (Y) bytes
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t" // 8 packed samples = 4 pixels
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
    "pand                %%mm2, %%mm0           \n\t" // keep Y, zero chroma bytes
    "pand                %%mm2, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t" // compact 8 Y bytes into one quadword
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
    "add                    $8, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];   // Y occupies every even byte of YUYV
#endif
}
1749

    
1750
// Deinterleave the chroma samples of packed YUY2 (Y U Y V) into separate
// U and V planes. src2 is unused (must equal src1, see assert below);
// 'width' is the number of U (and V) samples to produce.
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    // Negative index counting up to zero; pointers pre-advanced to the end.
    asm volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t" // byte mask for the even bytes
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t" // 4 pixels = 16 bytes per iteration
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t" // drop Y, leaving U/V in word low bytes
    "psrlw                  $8, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t" // mm0 = U0 V0 U1 V1 ...
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t" // mm0 -> V samples
    "pand                %%mm4, %%mm1           \n\t" // mm1 -> U samples
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t" // 4 V bytes to dstV
    "movd                %%mm1, (%2, %%"REG_a") \n\t" // 4 U bytes to dstU
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];   // U is byte 1 of each 4-byte YUYV group
        dstV[i]= src1[4*i + 3];   // V is byte 3
    }
#endif
    assert(src1 == src2);
}
1784

    
1785
/* This is almost identical to the previous, and exists only because
 * yuy2To(Y/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
// Extract the luma (Y) plane from packed UYVY (U Y V Y byte order) input.
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    // Negative index counting up to zero; pointers pre-advanced to the end.
    asm volatile(
    "mov                  %0, %%"REG_a"         \n\t"
    "1:                                         \n\t"
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t" // 8 packed samples = 4 pixels
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
    "psrlw                $8, %%mm0             \n\t" // Y is the high byte of each word
    "psrlw                $8, %%mm1             \n\t"
    "packuswb          %%mm1, %%mm0             \n\t" // compact 8 Y bytes
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
    "add                  $8, %%"REG_a"         \n\t"
    " js                  1b                    \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];   // Y occupies every odd byte of UYVY
#endif
}
1810

    
1811
// Deinterleave the chroma samples of packed UYVY (U Y V Y) into separate
// U and V planes. src2 is unused (must equal src1, see assert below).
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    // Negative index counting up to zero; pointers pre-advanced to the end.
    asm volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t" // byte mask for the even bytes
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t" // 4 pixels = 16 bytes per iteration
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "pand                %%mm4, %%mm0           \n\t" // keep U/V (even bytes), drop Y
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t" // mm0 = U0 V0 U1 V1 ...
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t" // mm0 -> V samples
    "pand                %%mm4, %%mm1           \n\t" // mm1 -> U samples
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t" // 4 V bytes to dstV
    "movd                %%mm1, (%2, %%"REG_a") \n\t" // 4 U bytes to dstU
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];   // U is byte 0 of each 4-byte UYVY group
        dstV[i]= src1[4*i + 2];   // V is byte 2
    }
#endif
    assert(src1 == src2);
}
1845

    
1846
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1847
{
1848
    int i;
1849
    for (i=0; i<width; i++)
1850
    {
1851
        int b=  ((uint32_t*)src)[i]&0xFF;
1852
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
1853
        int r= (((uint32_t*)src)[i]>>16)&0xFF;
1854

    
1855
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1856
    }
1857
}
1858

    
1859
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1860
{
1861
    int i;
1862
    assert(src1 == src2);
1863
    for (i=0; i<width; i++)
1864
    {
1865
        const int a= ((uint32_t*)src1)[2*i+0];
1866
        const int e= ((uint32_t*)src1)[2*i+1];
1867
        const int l= (a&0xFF00FF) + (e&0xFF00FF);
1868
        const int h= (a&0x00FF00) + (e&0x00FF00);
1869
        const int b=  l&0x3FF;
1870
        const int g=  h>>8;
1871
        const int r=  l>>16;
1872

    
1873
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1874
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1875
    }
1876
}
1877

    
1878
// Convert one row of packed 24-bit BGR pixels to 8-bit luma.
// The MMX path processes 8 pixels (24 bytes) per loop iteration using the
// ff_bgr2YCoeff multipliers; the C fallback mirrors it one pixel at a time.
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    // REG_a runs from -width up to 0; REG_d = 3*REG_a indexes the 3-byte
    // pixels. src/dst are pre-advanced to their ends for (base + index).
    asm volatile(
    "mov                        %2, %%"REG_a"   \n\t"
    "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t" // packed B,G,R -> Y weights
    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t" // used to sum word pairs via pmaddwd
    "pxor                    %%mm7, %%mm7       \n\t" // zero, for byte->word unpacking
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t" // REG_d = 3*REG_a (byte offset)
    ASMALIGN(4)
    "1:                                         \n\t"
    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
    // First four pixels: widen to words and apply the Y coefficients.
    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
    "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm1       \n\t"
    "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
    "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm3       \n\t"
    "pmaddwd                 %%mm6, %%mm0       \n\t"
    "pmaddwd                 %%mm6, %%mm1       \n\t"
    "pmaddwd                 %%mm6, %%mm2       \n\t"
    "pmaddwd                 %%mm6, %%mm3       \n\t"
#ifndef FAST_BGR2YV12
    // Extra precision: shift intermediates down before packing.
    "psrad                      $8, %%mm0       \n\t"
    "psrad                      $8, %%mm1       \n\t"
    "psrad                      $8, %%mm2       \n\t"
    "psrad                      $8, %%mm3       \n\t"
#endif
    "packssdw                %%mm1, %%mm0       \n\t"
    "packssdw                %%mm3, %%mm2       \n\t"
    "pmaddwd                 %%mm5, %%mm0       \n\t" // horizontal add of the partial sums
    "pmaddwd                 %%mm5, %%mm2       \n\t"
    "packssdw                %%mm2, %%mm0       \n\t"
    "psraw                      $7, %%mm0       \n\t"

    // Second four pixels, same computation into mm4.
    "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
    "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
    "punpcklbw               %%mm7, %%mm4       \n\t"
    "punpcklbw               %%mm7, %%mm1       \n\t"
    "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
    "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm3       \n\t"
    "pmaddwd                 %%mm6, %%mm4       \n\t"
    "pmaddwd                 %%mm6, %%mm1       \n\t"
    "pmaddwd                 %%mm6, %%mm2       \n\t"
    "pmaddwd                 %%mm6, %%mm3       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                      $8, %%mm4       \n\t"
    "psrad                      $8, %%mm1       \n\t"
    "psrad                      $8, %%mm2       \n\t"
    "psrad                      $8, %%mm3       \n\t"
#endif
    "packssdw                %%mm1, %%mm4       \n\t"
    "packssdw                %%mm3, %%mm2       \n\t"
    "pmaddwd                 %%mm5, %%mm4       \n\t"
    "pmaddwd                 %%mm5, %%mm2       \n\t"
    "add                       $24, %%"REG_d"   \n\t" // advance source by 8 BGR pixels
    "packssdw                %%mm2, %%mm4       \n\t"
    "psraw                      $7, %%mm4       \n\t"

    "packuswb                %%mm4, %%mm0       \n\t" // 8 luma bytes
    "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t" // add the luma offset (saturating)

    "movq                    %%mm0, (%1, %%"REG_a") \n\t"
    "add                        $8, %%"REG_a"   \n\t"
    " js                        1b              \n\t"
    : : "r" (src+width*3), "r" (dst+width), "g" (-width)
    : "%"REG_a, "%"REG_d
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        // 33<<(RGB2YUV_SHIFT-1) combines the +16 luma offset with rounding.
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* HAVE_MMX */
}
1962

    
1963
/* Compute horizontally 2:1 subsampled U and V from a packed BGR24 line.
 * Each output chroma sample is derived from the sum of two neighbouring
 * pixels (the extra +1 in the fallback's shift performs the /2).
 * The MMX path produces 4 U and 4 V samples per loop iteration.
 * NOTE(review): src2 is unused; the trailing assert documents that callers
 * must pass src1 == src2. */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    asm volatile(
    "mov                        %3, %%"REG_a"   \n\t" // REG_a = -width, counts up to 0
    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
    "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
    "pxor                    %%mm7, %%mm7       \n\t" // zero, for byte->word unpacking
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t" // REG_d = -width*3
    "add                 %%"REG_d", %%"REG_d"   \n\t" // REG_d = -width*6 (2 BGR24 px per chroma)
    ASMALIGN(4)
    "1:                                         \n\t"
    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    /* average two adjacent pixels with pavgb before unpacking */
    "movq          (%0, %%"REG_d"), %%mm0       \n\t"
    "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
    "movq                    %%mm0, %%mm1       \n\t"
    "movq                    %%mm2, %%mm3       \n\t"
    "psrlq                     $24, %%mm0       \n\t"
    "psrlq                     $24, %%mm2       \n\t"
    PAVGB(%%mm1, %%mm0)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
#else
    /* plain MMX: add the two pixels as words, then halve */
    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
    "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "paddw                   %%mm2, %%mm0       \n\t"
    "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
    "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm4       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "paddw                   %%mm4, %%mm2       \n\t"
    "psrlw                      $1, %%mm0       \n\t"
    "psrlw                      $1, %%mm2       \n\t"
#endif
    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"

    /* dot products with U (mm6) and V coefficient vectors */
    "pmaddwd                 %%mm0, %%mm1       \n\t"
    "pmaddwd                 %%mm2, %%mm3       \n\t"
    "pmaddwd                 %%mm6, %%mm0       \n\t"
    "pmaddwd                 %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                      $8, %%mm0       \n\t"
    "psrad                      $8, %%mm1       \n\t"
    "psrad                      $8, %%mm2       \n\t"
    "psrad                      $8, %%mm3       \n\t"
#endif
    "packssdw                %%mm2, %%mm0       \n\t"
    "packssdw                %%mm3, %%mm1       \n\t"
    "pmaddwd                 %%mm5, %%mm0       \n\t" // horizontal add via w1111
    "pmaddwd                 %%mm5, %%mm1       \n\t"
    "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
    "psraw                      $7, %%mm0       \n\t"

    /* same computation for the next pair of chroma samples (bytes 12..23) */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    "movq       12(%0, %%"REG_d"), %%mm4       \n\t"
    "movq       18(%0, %%"REG_d"), %%mm2       \n\t"
    "movq                   %%mm4, %%mm1       \n\t"
    "movq                   %%mm2, %%mm3       \n\t"
    "psrlq                    $24, %%mm4       \n\t"
    "psrlq                    $24, %%mm2       \n\t"
    PAVGB(%%mm1, %%mm4)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw              %%mm7, %%mm4       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
#else
    "movd       12(%0, %%"REG_d"), %%mm4       \n\t"
    "movd       15(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw              %%mm7, %%mm4       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
    "paddw                  %%mm2, %%mm4       \n\t"
    "movd       18(%0, %%"REG_d"), %%mm5       \n\t"
    "movd       21(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw              %%mm7, %%mm5       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
    "paddw                  %%mm5, %%mm2       \n\t"
    "movq      "MANGLE(ff_w1111)", %%mm5       \n\t" // mm5 was clobbered above, reload
    "psrlw                     $2, %%mm4       \n\t"
    "psrlw                     $2, %%mm2       \n\t"
#endif
    "movq "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
    "movq "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"

    "pmaddwd                %%mm4, %%mm1       \n\t"
    "pmaddwd                %%mm2, %%mm3       \n\t"
    "pmaddwd                %%mm6, %%mm4       \n\t"
    "pmaddwd                %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                     $8, %%mm4       \n\t"
    "psrad                     $8, %%mm1       \n\t"
    "psrad                     $8, %%mm2       \n\t"
    "psrad                     $8, %%mm3       \n\t"
#endif
    "packssdw               %%mm2, %%mm4       \n\t"
    "packssdw               %%mm3, %%mm1       \n\t"
    "pmaddwd                %%mm5, %%mm4       \n\t"
    "pmaddwd                %%mm5, %%mm1       \n\t"
    "add                      $24, %%"REG_d"   \n\t" // advance src by 8 pixels (24 bytes)
    "packssdw               %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
    "psraw                     $7, %%mm4       \n\t"

    /* interleave the two result quads and split into U and V planes */
    "movq                   %%mm0, %%mm1       \n\t"
    "punpckldq              %%mm4, %%mm0       \n\t"
    "punpckhdq              %%mm4, %%mm1       \n\t"
    "packsswb               %%mm1, %%mm0       \n\t"
    "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0    \n\t" // add the +128 chroma bias

    "movd                   %%mm0, (%1, %%"REG_a")  \n\t"
    "punpckhdq              %%mm0, %%mm0            \n\t"
    "movd                   %%mm0, (%2, %%"REG_a")  \n\t"
    "add                       $4, %%"REG_a"        \n\t"
    " js                       1b                   \n\t"
    : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
    : "%"REG_a, "%"REG_d
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        /* sum of two horizontally adjacent pixels; BGR byte order */
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        /* the +1 in the shift divides the two-pixel sum by 2 */
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
    }
#endif /* HAVE_MMX */
    assert(src1 == src2);
}
2096

    
2097
static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2098
{
2099
    int i;
2100
    for (i=0; i<width; i++)
2101
    {
2102
        int d= ((uint16_t*)src)[i];
2103
        int b= d&0x1F;
2104
        int g= (d>>5)&0x3F;
2105
        int r= (d>>11)&0x1F;
2106

    
2107
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2108
    }
2109
}
2110

    
2111
/* Compute horizontally 2:1 subsampled U and V from a packed RGB565 line.
 * Two adjacent 16-bit pixels are loaded as one 32-bit word and their
 * colour channels are summed in parallel (SWAR): dl/dh mask alternating
 * bit-fields of the two pixels apart so the add below cannot carry
 * between channels; after the rotate-and-add, d holds the two-pixel
 * B, R and G sums in disjoint fields (extracted as 0x7F / >>11 / >>21). */
static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1==src2);  // src2 is unused; both lines must alias
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];  // two RGB565 pixels at once

        int dl= (d0&0x07E0F81F);
        int dh= ((d0>>5)&0x07C0F83F);

        int dh2= (dh>>11) + (dh<<21);
        int d= dh2 + dl;

        int b= d&0x7F;        // sum of the two 5-bit blues (<= 62)
        int r= (d>>11)&0x7F;  // sum of the two 5-bit reds
        int g= d>>21;         // sum of the two 6-bit greens
        /* +1 halves the two-pixel sum; -2 rescales the 5/6-bit channels */
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
    }
}
2132

    
2133
static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2134
{
2135
    int i;
2136
    for (i=0; i<width; i++)
2137
    {
2138
        int d= ((uint16_t*)src)[i];
2139
        int b= d&0x1F;
2140
        int g= (d>>5)&0x1F;
2141
        int r= (d>>10)&0x1F;
2142

    
2143
        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2144
    }
2145
}
2146

    
2147
/* Compute horizontally 2:1 subsampled U and V from a packed RGB555 line.
 * Same SWAR scheme as rgb16ToUV, with 5-bit masks: two pixels are read as
 * one 32-bit word, their channels separated into non-carrying bit-fields,
 * and summed; d then holds per-channel two-pixel sums. */
static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1==src2);  // src2 is unused; both lines must alias
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];  // two RGB555 pixels at once

        int dl= (d0&0x03E07C1F);
        int dh= ((d0>>5)&0x03E0F81F);

        int dh2= (dh>>11) + (dh<<21);
        int d= dh2 + dl;

        int b= d&0x7F;        // sum of the two 5-bit blues
        int r= (d>>10)&0x7F;  // sum of the two 5-bit reds
        int g= d>>21;         // sum of the two 5-bit greens
        /* +1 halves the two-pixel sum; -3 rescales the 5-bit channels */
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
    }
}
2168

    
2169

    
2170
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2171
{
2172
    int i;
2173
    for (i=0; i<width; i++)
2174
    {
2175
        int r=  ((uint32_t*)src)[i]&0xFF;
2176
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
2177
        int b= (((uint32_t*)src)[i]>>16)&0xFF;
2178

    
2179
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2180
    }
2181
}
2182

    
2183
/* Compute horizontally 2:1 subsampled U and V from a packed 32-bit RGB line.
 * a and e are two adjacent pixels; l sums the red and blue bytes of both in
 * parallel (each per-channel sum is <= 0x1FE, so the fields cannot carry
 * into each other), h sums the green bytes. The extracted r/g/b are thus
 * two-pixel channel sums; the +1 in the shift performs the /2. */
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1==src2);  // src2 is unused; both lines must alias
    for (i=0; i<width; i++)
    {
        const int a= ((uint32_t*)src1)[2*i+0];
        const int e= ((uint32_t*)src1)[2*i+1];
        const int l= (a&0xFF00FF) + (e&0xFF00FF);  // red + blue sums, packed
        const int h= (a&0x00FF00) + (e&0x00FF00);  // green sums
        const int r=  l&0x3FF;
        const int g=  h>>8;
        const int b=  l>>16;

        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
    }
}
2201

    
2202
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2203
{
2204
    int i;
2205
    for (i=0; i<width; i++)
2206
    {
2207
        int r= src[i*3+0];
2208
        int g= src[i*3+1];
2209
        int b= src[i*3+2];
2210

    
2211
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2212
    }
2213
}
2214

    
2215
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2216
{
2217
    int i;
2218
    assert(src1==src2);
2219
    for (i=0; i<width; i++)
2220
    {
2221
        int r= src1[6*i + 0] + src1[6*i + 3];
2222
        int g= src1[6*i + 1] + src1[6*i + 4];
2223
        int b= src1[6*i + 2] + src1[6*i + 5];
2224

    
2225
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2226
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2227
    }
2228
}
2229

    
2230
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2231
{
2232
    int i;
2233
    for (i=0; i<width; i++)
2234
    {
2235
        int d= ((uint16_t*)src)[i];
2236
        int r= d&0x1F;
2237
        int g= (d>>5)&0x3F;
2238
        int b= (d>>11)&0x1F;
2239

    
2240
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2241
    }
2242
}
2243

    
2244
/* Compute horizontally 2:1 subsampled U and V from a packed BGR565 line.
 * SWAR: two 16-bit pixels are read as one 32-bit word; the 16-bit halves
 * are swapped and masked so that adding aligns the second pixel's channels
 * with the first without inter-field carries, yielding per-channel
 * two-pixel sums in d. */
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1 == src2);  // src2 is unused; both lines must alias
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];  // two BGR565 pixels at once

        int dl= (d0&0x07E0F81F);
        int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);

        int r= d&0x3F;        // sum of the two 5-bit reds
        int b= (d>>11)&0x3F;  // sum of the two 5-bit blues
        int g= d>>21;         // sum of the two 6-bit greens
        /* +1 halves the two-pixel sum; -2 rescales the 5/6-bit channels */
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
    }
}
2262

    
2263
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2264
{
2265
    int i;
2266
    for (i=0; i<width; i++)
2267
    {
2268
        int d= ((uint16_t*)src)[i];
2269
        int r= d&0x1F;
2270
        int g= (d>>5)&0x1F;
2271
        int b= (d>>10)&0x1F;
2272

    
2273
        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2274
    }
2275
}
2276

    
2277
/* Compute horizontally 2:1 subsampled U and V from a packed BGR555 line.
 * Same SWAR scheme as bgr16ToUV with 5-bit masks: half-swap, mask, and add
 * to obtain per-channel sums of the two pixels in disjoint bit-fields. */
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1 == src2);  // src2 is unused; both lines must alias
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];  // two BGR555 pixels at once

        int dl= (d0&0x03E07C1F);
        int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);

        int r= d&0x3F;        // sum of the two 5-bit reds
        int b= (d>>10)&0x3F;  // sum of the two 5-bit blues
        int g= d>>21;         // sum of the two 5-bit greens
        /* +1 halves the two-pixel sum; -3 rescales the 5-bit channels */
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
    }
}
2295

    
2296
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2297
{
2298
    int i;
2299
    for (i=0; i<width; i++)
2300
    {
2301
        int d= src[i];
2302

    
2303
        dst[i]= pal[d] & 0xFF;
2304
    }
2305
}
2306

    
2307
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2308
{
2309
    int i;
2310
    assert(src1 == src2);
2311
    for (i=0; i<width; i++)
2312
    {
2313
        int p= pal[src1[i]];
2314

    
2315
        dstU[i]= p>>8;
2316
        dstV[i]= p>>16;
2317
    }
2318
}
2319

    
2320
// bilinear / bicubic scaling
2321
/* Horizontally scale one 8-bit input line into a line of 15-bit (<<7)
 * intermediate samples using an FIR filter: for each output sample i,
 * sum src[filterPos[i] + j] * filter[filterSize*i + j] over j.
 * The MMX paths specialize filterSize 4 and 8 (two output samples per
 * iteration, negative-counter loops) and fall back to a generic inner
 * loop otherwise; non-x86 builds use AltiVec or plain C.
 * NOTE(review): srcW and xInc are unused here — kept for signature
 * compatibility with callers. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#ifdef HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        /* counter runs from -2*dstW up to 0; pointers are pre-biased so
         * indexing with the negative counter lands on the real arrays */
        long counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push            %%"REG_b"              \n\t" // ebx is the GOT pointer under PIC
#endif
        "pxor                %%mm7, %%mm7       \n\t"
        "movq        "MANGLE(w02)", %%mm6       \n\t"
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t" // filterPos[i]
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t" // filterPos[i+1]
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t" // 4 filter taps for i
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t" // 4 filter taps for i+1
        "movd      (%3, %%"REG_a"), %%mm0       \n\t" // 4 src bytes for i
        "movd      (%3, %%"REG_b"), %%mm2       \n\t" // 4 src bytes for i+1
        "punpcklbw           %%mm7, %%mm0       \n\t"
        "punpcklbw           %%mm7, %%mm2       \n\t"
        "pmaddwd             %%mm1, %%mm0       \n\t"
        "pmaddwd             %%mm2, %%mm3       \n\t"
        "psrad                  $8, %%mm0       \n\t"
        "psrad                  $8, %%mm3       \n\t"
        "packssdw            %%mm3, %%mm0       \n\t"
        "pmaddwd             %%mm6, %%mm0       \n\t" // finish horizontal sum via w02
        "packssdw            %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
        "add                    $4, %%"REG_BP"  \n\t"
        " jnc                   1b              \n\t"

        "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
        "pop             %%"REG_b"              \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        long counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push             %%"REG_b"             \n\t"
#endif
        "pxor                 %%mm7, %%mm7      \n\t"
        "movq         "MANGLE(w02)", %%mm6      \n\t"
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
        "mov              %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm0      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm0      \n\t"
        "pmaddwd              %%mm2, %%mm3      \n\t"

        /* second group of 4 taps for each of the two output samples */
        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm4      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm4      \n\t"
        "pmaddwd              %%mm2, %%mm5      \n\t"
        "paddd                %%mm4, %%mm0      \n\t"
        "paddd                %%mm5, %%mm3      \n\t"

        "psrad                   $8, %%mm0      \n\t"
        "psrad                   $8, %%mm3      \n\t"
        "packssdw             %%mm3, %%mm0      \n\t"
        "pmaddwd              %%mm6, %%mm0      \n\t"
        "packssdw             %%mm0, %%mm0      \n\t"
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
        "add                     $4, %%"REG_BP" \n\t"
        " jnc                    1b             \n\t"

        "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
        "pop              %%"REG_b"             \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else
    {
        /* generic filterSize: nested loop, inner loop bounded by src+filterSize */
        uint8_t *offset = src+filterSize;
        long counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
        "pxor                  %%mm7, %%mm7     \n\t"
        "movq          "MANGLE(w02)", %%mm6     \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "mov                      %2, %%"REG_c" \n\t"
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
        "mov                      %5, %%"REG_c" \n\t"
        "pxor                  %%mm4, %%mm4     \n\t"
        "pxor                  %%mm5, %%mm5     \n\t"
        "2:                                     \n\t" // inner loop over the taps
        "movq                   (%1), %%mm1     \n\t"
        "movq               (%1, %6), %%mm3     \n\t"
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
        "punpcklbw             %%mm7, %%mm0     \n\t"
        "punpcklbw             %%mm7, %%mm2     \n\t"
        "pmaddwd               %%mm1, %%mm0     \n\t"
        "pmaddwd               %%mm2, %%mm3     \n\t"
        "paddd                 %%mm3, %%mm5     \n\t"
        "paddd                 %%mm0, %%mm4     \n\t"
        "add                      $8, %1        \n\t"
        "add                      $4, %%"REG_c" \n\t"
        "cmp                      %4, %%"REG_c" \n\t"
        " jb                      2b            \n\t"
        "add                      %6, %1        \n\t" // skip the second sample's taps
        "psrad                    $8, %%mm4     \n\t"
        "psrad                    $8, %%mm5     \n\t"
        "packssdw              %%mm5, %%mm4     \n\t"
        "pmaddwd               %%mm6, %%mm4     \n\t"
        "packssdw              %%mm4, %%mm4     \n\t"
        "mov                      %3, %%"REG_a" \n\t"
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
        "add                      $4, %0        \n\t"
        " jnc                     1b            \n\t"

        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" (filterSize*2)
        : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#ifdef HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* HAVE_ALTIVEC */
#endif /* HAVE_MMX */
}
2503
      // *** horizontal scale Y line to temp buffer
2504
/* Horizontally scale a luma line into the temp buffer dst.
 * Packed/paletted source formats are first converted to a plain 8-bit
 * luma line in formatConvBuffer. Then either the FIR scaler (hScale) is
 * used, or — for SWS_FAST_BILINEAR — a fast bilinear path: runtime
 * generated MMX2 code ("funnyYCode") when canMMX2BeUsed, plain x86 asm
 * otherwise, and portable C on non-x86. */
static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint8_t *pal)
{
    /* convert packed input to an 8-bit luma line if necessary */
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));  // ebx is the GOT pointer under PIC
#endif
        if (canMMX2BeUsed)
        {
            /* call the runtime-generated scaler (funnyYCode) 8 times,
             * each call handling one chunk described by mmx2FilterPos */
            asm volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyYCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            /* fix up the tail: samples whose source would be past srcW-1 */
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif /* HAVE_MMX2 */
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        /* 16.16 fixed-point bilinear: two output samples per iteration,
         * position advanced via add-with-carry on the fractional part */
        asm volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* portable C bilinear: output scaled by 128 (<<7) */
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif /* defined(ARCH_X86) */
    }
}
2705

    
2706
inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2707
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2708
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2709
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2710
                                   int32_t *mmx2FilterPos, uint8_t *pal)
2711
{
2712
    if (srcFormat==PIX_FMT_YUYV422)
2713
    {
2714
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2715
        src1= formatConvBuffer;
2716
        src2= formatConvBuffer+VOFW;
2717
    }
2718
    else if (srcFormat==PIX_FMT_UYVY422)
2719
    {
2720
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2721
        src1= formatConvBuffer;
2722
        src2= formatConvBuffer+VOFW;
2723
    }
2724
    else if (srcFormat==PIX_FMT_RGB32)
2725
    {
2726
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2727
        src1= formatConvBuffer;
2728
        src2= formatConvBuffer+VOFW;
2729
    }
2730
    else if (srcFormat==PIX_FMT_BGR24)
2731
    {
2732
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2733
        src1= formatConvBuffer;
2734
        src2= formatConvBuffer+VOFW;
2735
    }
2736
    else if (srcFormat==PIX_FMT_BGR565)
2737
    {
2738
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2739
        src1= formatConvBuffer;
2740
        src2= formatConvBuffer+VOFW;
2741
    }
2742
    else if (srcFormat==PIX_FMT_BGR555)
2743
    {
2744
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2745
        src1= formatConvBuffer;
2746
        src2= formatConvBuffer+VOFW;
2747
    }
2748
    else if (srcFormat==PIX_FMT_BGR32)
2749
    {
2750
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2751
        src1= formatConvBuffer;
2752
        src2= formatConvBuffer+VOFW;
2753
    }
2754
    else if (srcFormat==PIX_FMT_RGB24)
2755
    {
2756
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2757
        src1= formatConvBuffer;
2758
        src2= formatConvBuffer+VOFW;
2759
    }
2760
    else if (srcFormat==PIX_FMT_RGB565)
2761
    {
2762
        RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2763
        src1= formatConvBuffer;
2764
        src2= formatConvBuffer+VOFW;
2765
    }
2766
    else if (srcFormat==PIX_FMT_RGB555)
2767
    {
2768
        RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2769
        src1= formatConvBuffer;
2770
        src2= formatConvBuffer+VOFW;
2771
    }
2772
    else if (isGray(srcFormat))
2773
    {
2774
        return;
2775
    }
2776
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2777
    {
2778
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2779
        src1= formatConvBuffer;
2780
        src2= formatConvBuffer+VOFW;
2781
    }
2782

    
2783
#ifdef HAVE_MMX
2784
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2785
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2786
#else
2787
    if (!(flags&SWS_FAST_BILINEAR))
2788
#endif
2789
    {
2790
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2791
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2792
    }
2793
    else // fast bilinear upscale / crap downscale
2794
    {
2795
#if defined(ARCH_X86)
2796
#ifdef HAVE_MMX2
2797
        int i;
2798
#if defined(PIC)
2799
        uint64_t ebxsave __attribute__((aligned(8)));
2800
#endif
2801
        if (canMMX2BeUsed)
2802
        {
2803
            asm volatile(
2804
#if defined(PIC)
2805
            "mov          %%"REG_b", %6         \n\t"
2806
#endif
2807
            "pxor             %%mm7, %%mm7      \n\t"
2808
            "mov                 %0, %%"REG_c"  \n\t"
2809
            "mov                 %1, %%"REG_D"  \n\t"
2810
            "mov                 %2, %%"REG_d"  \n\t"
2811
            "mov                 %3, %%"REG_b"  \n\t"
2812
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2813
            PREFETCH"   (%%"REG_c")             \n\t"
2814
            PREFETCH" 32(%%"REG_c")             \n\t"
2815
            PREFETCH" 64(%%"REG_c")             \n\t"
2816

    
2817
#ifdef ARCH_X86_64
2818

    
2819
#define FUNNY_UV_CODE \
2820
            "movl       (%%"REG_b"), %%esi      \n\t"\
2821
            "call               *%4             \n\t"\
2822
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2823
            "add          %%"REG_S", %%"REG_c"  \n\t"\
2824
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2825
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2826

    
2827
#else
2828

    
2829
#define FUNNY_UV_CODE \
2830
            "movl       (%%"REG_b"), %%esi      \n\t"\
2831
            "call               *%4             \n\t"\
2832
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2833
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2834
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2835

    
2836
#endif /* ARCH_X86_64 */
2837

    
2838
FUNNY_UV_CODE
2839
FUNNY_UV_CODE
2840
FUNNY_UV_CODE
2841
FUNNY_UV_CODE
2842
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2843
            "mov                 %5, %%"REG_c"  \n\t" // src
2844
            "mov                 %1, %%"REG_D"  \n\t" // buf1
2845
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
2846
            PREFETCH"   (%%"REG_c")             \n\t"
2847
            PREFETCH" 32(%%"REG_c")             \n\t"
2848
            PREFETCH" 64(%%"REG_c")             \n\t"
2849

    
2850
FUNNY_UV_CODE
2851
FUNNY_UV_CODE
2852
FUNNY_UV_CODE
2853
FUNNY_UV_CODE
2854

    
2855
#if defined(PIC)
2856
            "mov %6, %%"REG_b"    \n\t"
2857
#endif
2858
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2859
            "m" (funnyUVCode), "m" (src2)
2860
#if defined(PIC)
2861
            ,"m" (ebxsave)
2862
#endif
2863
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2864
#if !defined(PIC)
2865
             ,"%"REG_b
2866
#endif
2867
            );
2868
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2869
            {
2870
                //printf("%d %d %d\n", dstWidth, i, srcW);
2871
                dst[i] = src1[srcW-1]*128;
2872
                dst[i+VOFW] = src2[srcW-1]*128;
2873
            }
2874
        }
2875
        else
2876
        {
2877
#endif /* HAVE_MMX2 */
2878
            long xInc_shr16 = (long) (xInc >> 16);
2879
            uint16_t xInc_mask = xInc & 0xffff;
2880
            asm volatile(
2881
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
2882
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2883
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
2884
            ASMALIGN(4)
2885
            "1:                                     \n\t"
2886
            "mov        %0, %%"REG_S"               \n\t"
2887
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2888
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2889
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2890
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2891
            "shll      $16, %%edi                   \n\t"
2892
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2893
            "mov        %1, %%"REG_D"               \n\t"
2894
            "shrl       $9, %%esi                   \n\t"
2895
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2896

    
2897
            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2898
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2899
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2900
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2901
            "shll      $16, %%edi                   \n\t"
2902
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2903
            "mov        %1, %%"REG_D"               \n\t"
2904
            "shrl       $9, %%esi                   \n\t"
2905
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
2906

    
2907
            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
2908
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
2909
            "add        $1, %%"REG_a"               \n\t"
2910
            "cmp        %2, %%"REG_a"               \n\t"
2911
            " jb        1b                          \n\t"
2912

    
2913
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2914
   which is needed to support GCC 4.0. */
2915
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2916
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2917
#else
2918
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2919
#endif
2920
            "r" (src2)
2921
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2922
            );
2923
#ifdef HAVE_MMX2
2924
        } //if MMX2 can't be used
2925
#endif
2926
#else
2927
        int i;
2928
        unsigned int xpos=0;
2929
        for (i=0;i<dstWidth;i++)
2930
        {
2931
            register unsigned int xx=xpos>>16;
2932
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
2933
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2934
            dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2935
            /* slower
2936
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2937
            dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2938
            */
2939
            xpos+=xInc;
2940
        }
2941
#endif /* defined(ARCH_X86) */
2942
    }
2943
}
2944

    
2945
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2946
                           int srcSliceH, uint8_t* dst[], int dstStride[]){
2947

    
2948
    /* load a few things into local vars to make the code more readable? and faster */
2949
    const int srcW= c->srcW;
2950
    const int dstW= c->dstW;
2951
    const int dstH= c->dstH;
2952
    const int chrDstW= c->chrDstW;
2953
    const int chrSrcW= c->chrSrcW;
2954
    const int lumXInc= c->lumXInc;
2955
    const int chrXInc= c->chrXInc;
2956
    const int dstFormat= c->dstFormat;
2957
    const int srcFormat= c->srcFormat;
2958
    const int flags= c->flags;
2959
    const int canMMX2BeUsed= c->canMMX2BeUsed;
2960
    int16_t *vLumFilterPos= c->vLumFilterPos;
2961
    int16_t *vChrFilterPos= c->vChrFilterPos;
2962
    int16_t *hLumFilterPos= c->hLumFilterPos;
2963
    int16_t *hChrFilterPos= c->hChrFilterPos;
2964
    int16_t *vLumFilter= c->vLumFilter;
2965
    int16_t *vChrFilter= c->vChrFilter;
2966
    int16_t *hLumFilter= c->hLumFilter;
2967
    int16_t *hChrFilter= c->hChrFilter;
2968
    int32_t *lumMmxFilter= c->lumMmxFilter;
2969
    int32_t *chrMmxFilter= c->chrMmxFilter;
2970
    const int vLumFilterSize= c->vLumFilterSize;
2971
    const int vChrFilterSize= c->vChrFilterSize;
2972
    const int hLumFilterSize= c->hLumFilterSize;
2973
    const int hChrFilterSize= c->hChrFilterSize;
2974
    int16_t **lumPixBuf= c->lumPixBuf;
2975
    int16_t **chrPixBuf= c->chrPixBuf;
2976
    const int vLumBufSize= c->vLumBufSize;
2977
    const int vChrBufSize= c->vChrBufSize;
2978
    uint8_t *funnyYCode= c->funnyYCode;
2979
    uint8_t *funnyUVCode= c->funnyUVCode;
2980
    uint8_t *formatConvBuffer= c->formatConvBuffer;
2981
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2982
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2983
    int lastDstY;
2984
    uint8_t *pal=NULL;
2985

    
2986
    /* vars which will change and which we need to store back in the context */
2987
    int dstY= c->dstY;
2988
    int lumBufIndex= c->lumBufIndex;
2989
    int chrBufIndex= c->chrBufIndex;
2990
    int lastInLumBuf= c->lastInLumBuf;
2991
    int lastInChrBuf= c->lastInChrBuf;
2992

    
2993
    if (isPacked(c->srcFormat)){
2994
        pal= src[1];
2995
        src[0]=
2996
        src[1]=
2997
        src[2]= src[0];
2998
        srcStride[0]=
2999
        srcStride[1]=
3000
        srcStride[2]= srcStride[0];
3001
    }
3002
    srcStride[1]<<= c->vChrDrop;
3003
    srcStride[2]<<= c->vChrDrop;
3004

    
3005
    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
3006
    //       (int)dst[0], (int)dst[1], (int)dst[2]);
3007

    
3008
#if 0 //self test FIXME move to a vfilter or something
3009
    {
3010
    static volatile int i=0;
3011
    i++;
3012
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
3013
        selfTest(src, srcStride, c->srcW, c->srcH);
3014
    i--;
3015
    }
3016
#endif
3017

    
3018
    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
3019
    //dstStride[0],dstStride[1],dstStride[2]);
3020

    
3021
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3022
    {
3023
        static int firstTime=1; //FIXME move this into the context perhaps
3024
        if (flags & SWS_PRINT_INFO && firstTime)
3025
        {
3026
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
3027
                   "         ->cannot do aligned memory accesses anymore\n");
3028
            firstTime=0;
3029
        }
3030
    }
3031

    
3032
    /* Note the user might start scaling the picture in the middle so this
3033
       will not get executed. This is not really intended but works
3034
       currently, so people might do it. */
3035
    if (srcSliceY ==0){
3036
        lumBufIndex=0;
3037
        chrBufIndex=0;
3038
        dstY=0;
3039
        lastInLumBuf= -1;
3040
        lastInChrBuf= -1;
3041
    }
3042

    
3043
    lastDstY= dstY;
3044

    
3045
    for (;dstY < dstH; dstY++){
3046
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
3047
        const int chrDstY= dstY>>c->chrDstVSubSample;
3048
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3049
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3050

    
3051
        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3052
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3053
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3054
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3055

    
3056
        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3057
        // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
3058
        //handle holes (FAST_BILINEAR & weird filters)
3059
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3060
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3061
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3062
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
3063
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
3064

    
3065
        // Do we have enough lines in this slice to output the dstY line
3066
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3067
        {
3068
            //Do horizontal scaling
3069
            while(lastInLumBuf < lastLumSrcY)
3070
            {
3071
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3072
                lumBufIndex++;
3073
                //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
3074
                assert(lumBufIndex < 2*vLumBufSize);
3075
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3076
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
3077
                //printf("%d %d\n", lumBufIndex, vLumBufSize);
3078
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3079
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3080
                                funnyYCode, c->srcFormat, formatConvBuffer,
3081
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3082
                lastInLumBuf++;
3083
            }
3084
            while(lastInChrBuf < lastChrSrcY)
3085
            {
3086
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3087
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3088
                chrBufIndex++;
3089
                assert(chrBufIndex < 2*vChrBufSize);
3090
                assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
3091
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3092
                //FIXME replace parameters through context struct (some at least)
3093

    
3094
                if (!(isGray(srcFormat) || isGray(dstFormat)))
3095
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3096
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3097
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
3098
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3099
                lastInChrBuf++;
3100
            }
3101
            //wrap buf index around to stay inside the ring buffer
3102
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3103
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3104
        }
3105
        else // not enough lines left in this slice -> load the rest in the buffer
3106
        {
3107
            /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3108
            firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3109
            lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3110
            vChrBufSize, vLumBufSize);*/
3111

    
3112
            //Do horizontal scaling
3113
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3114
            {
3115
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3116
                lumBufIndex++;
3117
                assert(lumBufIndex < 2*vLumBufSize);
3118
                assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
3119
                assert(lastInLumBuf + 1 - srcSliceY >= 0);
3120
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3121
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3122
                                funnyYCode, c->srcFormat, formatConvBuffer,
3123
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3124
                lastInLumBuf++;
3125
            }
3126
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3127
            {
3128
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3129
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3130
                chrBufIndex++;
3131
                assert(chrBufIndex < 2*vChrBufSize);
3132
                assert(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH);
3133
                assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
3134

    
3135
                if (!(isGray(srcFormat) || isGray(dstFormat)))
3136
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3137
                            flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3138
                            funnyUVCode, c->srcFormat, formatConvBuffer,
3139
                            c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3140
                lastInChrBuf++;
3141
            }
3142
            //wrap buf index around to stay inside the ring buffer
3143
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3144
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3145
            break; //we can't output a dstY line so let's try with the next slice
3146
        }
3147

    
3148
#ifdef HAVE_MMX
3149
        b5Dither= ff_dither8[dstY&1];
3150
        g6Dither= ff_dither4[dstY&1];
3151
        g5Dither= ff_dither8[dstY&1];
3152
        r5Dither= ff_dither8[(dstY+1)&1];
3153
#endif
3154
        if (dstY < dstH-2)
3155
        {
3156
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3157
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3158
#ifdef HAVE_MMX
3159
            int i;
3160
        if (flags & SWS_ACCURATE_RND){
3161
            for (i=0; i<vLumFilterSize; i+=2){
3162
                lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i  ];
3163
                lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3164
                lumMmxFilter[2*i+2]=
3165
                lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i    ]
3166
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3167
            }
3168
            for (i=0; i<vChrFilterSize; i+=2){
3169
                chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i  ];
3170
                chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3171
                chrMmxFilter[2*i+2]=
3172
                chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i    ]
3173
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3174
            }
3175
        }else{
3176
            for (i=0; i<vLumFilterSize; i++)
3177
            {
3178
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3179
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3180
                lumMmxFilter[4*i+2]=
3181
                lumMmxFilter[4*i+3]=
3182
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3183
            }
3184
            for (i=0; i<vChrFilterSize; i++)
3185
            {
3186
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3187
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3188
                chrMmxFilter[4*i+2]=
3189
                chrMmxFilter[4*i+3]=
3190
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3191
            }
3192
        }
3193
#endif
3194
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3195
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3196
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3197
                RENAME(yuv2nv12X)(c,
3198
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3199
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3200
                    dest, uDest, dstW, chrDstW, dstFormat);
3201
            }
3202
            else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3203
            {
3204
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3205
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3206
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3207
                {
3208
                    int16_t *lumBuf = lumPixBuf[0];
3209
                    int16_t *chrBuf= chrPixBuf[0];
3210
                    RENAME(yuv2yuv1)(c, lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3211
                }
3212
                else //General YV12
3213
                {
3214
                    RENAME(yuv2yuvX)(c,
3215
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3216
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3217
                        dest, uDest, vDest, dstW, chrDstW);
3218
                }
3219
            }
3220
            else
3221
            {
3222
                assert(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3223
                assert(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3224
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3225
                {
3226
                    int chrAlpha= vChrFilter[2*dstY+1];
3227
                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3228
                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
3229
                }
3230
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3231
                {
3232
                    int lumAlpha= vLumFilter[2*dstY+1];
3233
                    int chrAlpha= vChrFilter[2*dstY+1];
3234
                    lumMmxFilter[2]=
3235
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
3236
                    chrMmxFilter[2]=
3237
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3238
                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3239
                        dest, dstW, lumAlpha, chrAlpha, dstY);
3240
                }
3241
                else //general RGB
3242
                {
3243
                    RENAME(yuv2packedX)(c,
3244
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3245
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3246
                        dest, dstW, dstY);
3247
                }
3248
            }
3249
        }
3250
        else // hmm looks like we can't use MMX here without overwriting this array's tail
3251
        {
3252
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3253
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3254
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3255
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3256
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3257
                yuv2nv12XinC(
3258
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr,