/* ffmpeg / libswscale / swscale_template.c @ revision 8a322796 */
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
/*
 * CPU-capability-dependent instruction selection.  This template is
 * compiled multiple times with different combinations of HAVE_MMX2 /
 * HAVE_3DNOW defined, so every variant-specific mnemonic macro is
 * #undef'd first and then redefined for the current target.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE

#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

/* Prefetch hints: 3DNow! provides its own opcodes, MMX2 uses the SSE
 * hint forms, otherwise prefetching degrades to an assembler comment. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined (HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

/* Store fence is only available on MMX2 (needed after movntq streams). */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif

/* Byte average: pavgb on MMX2, the 3DNow! equivalent pavgusb otherwise. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* Non-temporal store when available, plain movq otherwise.  MOVNTQ is a
 * second macro level so that its arguments (e.g. register-name macros
 * such as REGa, defined elsewhere) are fully expanded before
 * REAL_MOVNTQ stringifies them with #a / #b. */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)

#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif

/*
 * Vertical scaling of one planar output line (luma or chroma, selected
 * via the 'offset' into the context pointed to by %0).  The filter list
 * at that offset is a sequence of 16-byte entries holding a source-line
 * pointer (at +0) and 16-bit coefficients (at +8), terminated by a NULL
 * pointer.  The inner loop (1:) accumulates pmulhw(src, coeff) into
 * mm3/mm4 starting from the rounder constant at VROUNDER_OFFSET; the
 * sums are >>3, packed to unsigned bytes and streamed out 8 pixels per
 * outer iteration until 'width' is reached.
 */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    asm volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t" /* loop while next src pointer != NULL */\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/*
 * Higher-precision variant of YSCALEYUV2YV12X: instead of pmulhw (which
 * rounds per tap) it interleaves samples from two taps with
 * punpck{l,h}wd and uses pmaddwd, accumulating in 32 bits (mm4-mm7) and
 * rounding only once at the end (>>16, + rounder, >>3).  Each 16-byte
 * filter entry therefore supplies two source pointers (read at +0 and
 * +4 — NOTE(review): presumably 32-bit pointers, two taps per entry;
 * confirm against the filter builder) and their coefficient pair at +8;
 * a NULL pointer terminates the list.
 */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    asm volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov                       4(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   "  x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq                      8(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+"  x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov                      16(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                                $16, %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t" /* loop while next src pointer != NULL */\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );

/*
 * Unscaled (1-tap) vertical output: 16-bit intermediate samples are
 * shifted down by their 7 fractional bits and packed to unsigned bytes,
 * 8 pixels per iteration.  %2 apparently holds a negative index loaded
 * into REG_a that is walked up toward zero — the 'jnc' exits once the
 * 'add $8' carries past 0 (NOTE(review): confirm against the caller).
 */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"

/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/*
 * Common head for the packed-output vertical scalers.  The first inner
 * loop (2:) accumulates the chroma filter taps (U into mm3, V into mm4;
 * V samples sit AV_STRINGIFY(VOF) bytes after U in each source line),
 * the second accumulates the luma taps (Y1 into mm1, Y2 into mm7).
 * Both filter lists use the same NULL-terminated 16-byte-entry layout
 * as YSCALEYUV2YV12X.  Leaves the still-16-bit Y/U/V sums in registers
 * for the conversion tail appended by the user of this macro; the
 * matching constraint tail is YSCALEYUV2PACKEDX_END.
 */
#define YSCALEYUV2PACKEDX \
    asm volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
    "movq                      %%mm1, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm1         \n\t"\
    "paddw                     %%mm5, %%mm7         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\

/*
 * Shared operand/clobber tail for the asm statement opened by the
 * YSCALEYUV2PACKEDX* heads: %0 = context (via &c->redDither),
 * %1 = dest, %2 = dstW; the three "m"(dummy) operands only keep the
 * operand numbering stable.
 */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );

/*
 * Higher-precision (pmaddwd, 32-bit accumulation) variant of
 * YSCALEYUV2PACKEDX.  The chroma sums are computed first and parked in
 * the U_TEMP / V_TEMP scratch slots of the context because all eight
 * MMX registers are needed for the luma pass; they are reloaded into
 * mm3/mm4 at the end so the conversion tail sees the same register
 * layout as after YSCALEYUV2PACKEDX.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq               8(%%"REG_d"), %%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq "AV_STRINGIFY(VOF)"(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
    "add                         $16, %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq               8(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
    "add                         $16, %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\

/*
 * YUV -> RGB conversion tail: expects U in mm3, V in mm4, Y1 in mm1 and
 * Y2 in mm7 (the layout left by the YSCALEYUV2PACKEDX* heads), applies
 * the offset/coefficient table stored in the context (%0), and leaves
 * packed bytes B in mm2, R in mm5, G in mm4 (pixel pairs interleaved by
 * the punpck/paddw sequence), with mm7 zeroed for the output-packing
 * code that follows this macro.
 */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\
    "pxor            %%mm7, %%mm7       \n\t"
#if 0
/* Dead code, disabled with #if 0 and kept for reference: an older
 * full-range bilinear YUV->RGB path using global MANGLE()d constants
 * instead of the per-context coefficient table. */
#define FULL_YSCALEYUV2RGB \
    "pxor                 %%mm7, %%mm7  \n\t"\
    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "xor              %%"REG_a", %%"REG_a"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq "AV_STRINGIFY(VOF)"(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq "AV_STRINGIFY(VOF)"(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
\
\
    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
    "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
\
\
    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
    "packuswb             %%mm3, %%mm3  \n\t"\
\
    "packuswb             %%mm0, %%mm0  \n\t"\
    "paddw                %%mm4, %%mm2  \n\t"\
    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
\
    "packuswb             %%mm1, %%mm1  \n\t"
#endif

/*
 * Two-line (bilinear) vertical interpolation for packed YUV output.
 * The 16.16 blend weights stored at {CHR,LUM}_MMX_FILTER_OFFSET+8 in
 * the context "#c" are pre-shifted right by 3 in place (once per
 * invocation, before the loop), then each sample is computed as
 * src1 + ((src0-src1)*weight >> 16) with the sources kept at 7
 * fractional bits for the packing code appended after this macro.
 * Sources: buf0=%0, buf1=%1, uvbuf0=%2, uvbuf1=%3; V samples sit
 * AV_STRINGIFY(VOF) bytes after U in the chroma buffers.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)

/*
 * Bilinear two-line vertical blend plus YUV->RGB conversion.  Same
 * source layout as REAL_YSCALEYUV2PACKED (buf0=%0, buf1=%1, uvbuf0=%2,
 * uvbuf1=%3, weights at the +8 filter slots of the context "#c"), but
 * samples are kept at 4 fractional bits and fed through the context's
 * offset/coefficient table, producing the same register layout as
 * YSCALEYUV2RGBX: packed B in mm2, R in mm5, G in mm4, mm7 zeroed.
 */
#define REAL_YSCALEYUV2RGB(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)

/*
 * Single-source variant of REAL_YSCALEYUV2PACKED: only one input line
 * (buf0=%0, uvbuf0=%2) contributes, so no vertical blend is needed —
 * the samples are just shifted down by their 7 fractional bits, leaving
 * U in mm3, V in mm4, Y1 in mm1, Y2 in mm7 for the packing code.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)

544
/* Single-source-line YUV->RGB: reads one chroma line (%2) and one luma
 * line (%0), applies the per-context offsets/coefficients at U_OFFSET etc.
 * (loaded via "#c"), and leaves packed B in mm2, G in mm4, R in mm5 with
 * mm7 zeroed for the WRITEBGR* macros.
 * NOTE(review): stray viewer line numbers spliced into the continuations
 * were removed; instruction text is unchanged. */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
592

    
593
/* Like YSCALEYUV2PACKED1 but averages two chroma source lines (%2 and %3):
 * paddw of the pair then psrlw $8 yields (a+b)/2 in 8-bit range.
 * Leaves Y in mm1/mm7 and averaged U/V in mm3/mm4.
 * NOTE(review): stray viewer line numbers spliced into the continuations
 * were removed; instruction text is unchanged. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
610

    
611
// do vertical chrominance interpolation
/* Like YSCALEYUV2RGB1 but averages two chroma lines (%2, %3) first
 * (paddw then psrlw $5 — see the FIXME on possible overflow), then applies
 * the same offset/coefficient pipeline; leaves B/G/R packed in mm2/mm4/mm5
 * with mm7 zeroed for the WRITEBGR* macros.
 * NOTE(review): stray viewer line numbers spliced into the continuations
 * were removed; instruction text is unchanged. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq "AV_STRINGIFY(VOF)"(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq "AV_STRINGIFY(VOF)"(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
664

    
665
/* Interleave B (mm2), G (mm4), R (mm5) with a zero byte (mm7) into eight
 * 32-bit pixels, store them with MOVNTQ at dst+index*4, advance index by 8
 * pixels and loop back to label 1 while index < dstw.
 * NOTE(review): stray viewer line numbers spliced into the continuations
 * were removed; instruction text is unchanged. */
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0,   (dst, index, 4))\
    MOVNTQ(%%mm2,  8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
689

    
690
/* Pack B/G/R (mm2/mm4/mm5) down to RGB565: mask to 5/6/5 significant bits
 * via the bF8/bFC constants, shift into place, OR together and store eight
 * 16-bit pixels at dst+index*2; loops to label 1 while index < dstw.
 * NOTE(review): stray viewer line numbers spliced into the continuations
 * were removed; instruction text is unchanged. */
#define REAL_WRITEBGR16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
717

    
718
/* Pack B/G/R (mm2/mm4/mm5) down to RGB555: all three channels masked to 5
 * bits (bF8), R pre-shifted right by 1 so the result has a zero top bit;
 * stores eight 16-bit pixels at dst+index*2 and loops while index < dstw.
 * NOTE(review): stray viewer line numbers spliced into the continuations
 * were removed; instruction text is unchanged. */
#define REAL_WRITEBGR15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
746

    
747
/* Legacy 24bpp writer: builds four 0RGB dwords, then shifts/masks them
 * (bm* constants) into three packed 8-byte stores of 24-bit pixels.
 * Kept for reference; WRITEBGR24 below selects the MMX/MMX2 versions.
 * NOTE(review): stray viewer line numbers spliced into the continuations
 * were removed; instruction text is unchanged. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
802

    
803
/* Plain-MMX 24bpp writer: interleaves B/G/R into four 0RGB dwords, shifts
 * each to RGB00000 form, recombines via punpckhdq and psll/psrl+por into
 * three contiguous 8-byte stores (24 bytes = 8 pixels); advances dst by 24
 * and index by 8, looping to label 1 while index < dstw.
 * NOTE(review): stray viewer line numbers spliced into the continuations
 * were removed; instruction text is unchanged. */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
855

    
856
/* MMX2 24bpp writer: uses pshufw plus the ff_M24A/B/C byte masks to gather
 * B/G/R bytes directly into three 8-byte groups, stored with MOVNTQ;
 * advances dst by 24 and index by 8, looping to label 1 while index < dstw.
 * NOTE(review): stray viewer line numbers spliced into the continuations
 * were removed; instruction text is unchanged. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
903

    
904
/* Select the 24bpp writer for this compilation pass: pshufw-based version
 * when MMX2 is available, plain-MMX version otherwise.
 * NOTE(review): stray viewer line numbers between the directives were
 * removed; directives are unchanged. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
911

    
912
/* Pack Y (mm1/mm7), U (mm3), V (mm4) into YUYV order and store 16 bytes at
 * dst+index*2; advances index by 8 and loops to label 1 while index < dstw.
 * NOTE(review): stray viewer line numbers spliced into the continuations
 * were removed; instruction text is unchanged. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
928

    
929

    
930
/**
 * Vertically filter luma and chroma into planar YV12 output.
 * Dispatches to the MMX asm macros (accurate-rounding variant when
 * SWS_ACCURATE_RND is set), to AltiVec, or to the C fallback, depending on
 * the build configuration. uDest==NULL skips the chroma planes.
 * NOTE(review): stray viewer line numbers embedded in the body were
 * removed; code is otherwise unchanged.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (c->flags & SWS_ACCURATE_RND){
        if (uDest){
            YSCALEYUV2YV12X_ACCURATE(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X_ACCURATE(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X_ACCURATE("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }else{
        if (uDest){
            YSCALEYUV2YV12X(   "0", CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X(AV_STRINGIFY(VOF), CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X("0", LUM_MMX_FILTER_OFFSET, dest, dstW)
    }
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif /* HAVE_MMX */
}
962

    
963
/**
 * Vertically filter into NV12/NV21 (interleaved-chroma) output.
 * No SIMD variant exists in this template; always forwards to the C
 * implementation yuv2nv12XinC().
 * NOTE(review): stray viewer line numbers embedded in the body were
 * removed; code is otherwise unchanged.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
             chrFilter, chrSrc, chrFilterSize,
             dest, uDest, dstW, chrDstW, dstFormat);
}
971

    
972
/**
 * 1:1 vertical pass: convert the 15-bit intermediate (values are shifted
 * down by 7) to 8-bit planar output with clamping to [0,255].
 * MMX path runs YSCALEYUV2YV121 over U, V (chrSrc / chrSrc+VOFW) and luma;
 * the C fallback clamps only when (val&256) indicates an out-of-range
 * value. uDest==NULL skips both chroma planes (vDest is written only
 * together with uDest, matching the MMX path).
 * NOTE(review): stray viewer line numbers embedded in the body were
 * removed; code is otherwise unchanged.
 */
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (uDest)
    {
        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
            "g" (-chrDstW)
            : "%"REG_a
        );

        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + VOFW + chrDstW), "r" (vDest + chrDstW),
            "g" (-chrDstW)
            : "%"REG_a
        );
    }

    asm volatile(
        YSCALEYUV2YV121
        :: "r" (lumSrc + dstW), "r" (dest + dstW),
        "g" (-dstW)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int val= lumSrc[i]>>7;

        /* bit 8 set means the value left [0,255]; sign decides which end */
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=chrSrc[i]>>7;
            int v=chrSrc[i + VOFW]>>7;

            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
#endif
}
1031

    
1032

    
1033
/**
1034
 * vertical scale YV12 to RGB
1035
 */
1036
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1037
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1038
                                       uint8_t *dest, long dstW, long dstY)
1039
{
1040
#ifdef HAVE_MMX
1041
    long dummy=0;
1042
    if (c->flags & SWS_ACCURATE_RND){
1043
        switch(c->dstFormat){
1044
        case PIX_FMT_RGB32:
1045
            YSCALEYUV2PACKEDX_ACCURATE
1046
            YSCALEYUV2RGBX
1047
            WRITEBGR32(%4, %5, %%REGa)
1048

    
1049
            YSCALEYUV2PACKEDX_END
1050
            return;
1051
        case PIX_FMT_BGR24:
1052
            YSCALEYUV2PACKEDX_ACCURATE
1053
            YSCALEYUV2RGBX
1054
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1055
            "add %4, %%"REG_c"                        \n\t"
1056
            WRITEBGR24(%%REGc, %5, %%REGa)
1057

    
1058

    
1059
            :: "r" (&c->redDither),
1060
               "m" (dummy), "m" (dummy), "m" (dummy),
1061
               "r" (dest), "m" (dstW)
1062
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1063
            );
1064
            return;
1065
        case PIX_FMT_BGR555:
1066
            YSCALEYUV2PACKEDX_ACCURATE
1067
            YSCALEYUV2RGBX
1068
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1069
#ifdef DITHER1XBPP
1070
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1071
            "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1072
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1073
#endif
1074

    
1075
            WRITEBGR15(%4, %5, %%REGa)
1076
            YSCALEYUV2PACKEDX_END
1077
            return;
1078
        case PIX_FMT_BGR565:
1079
            YSCALEYUV2PACKEDX_ACCURATE
1080
            YSCALEYUV2RGBX
1081
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1082
#ifdef DITHER1XBPP
1083
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1084
            "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1085
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1086
#endif
1087

    
1088
            WRITEBGR16(%4, %5, %%REGa)
1089
            YSCALEYUV2PACKEDX_END
1090
            return;
1091
        case PIX_FMT_YUYV422:
1092
            YSCALEYUV2PACKEDX_ACCURATE
1093
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1094

    
1095
            "psraw $3, %%mm3    \n\t"
1096
            "psraw $3, %%mm4    \n\t"
1097
            "psraw $3, %%mm1    \n\t"
1098
            "psraw $3, %%mm7    \n\t"
1099
            WRITEYUY2(%4, %5, %%REGa)
1100
            YSCALEYUV2PACKEDX_END
1101
            return;
1102
    }
1103
    }else{
1104
        switch(c->dstFormat)
1105
        {
1106
        case PIX_FMT_RGB32:
1107
            YSCALEYUV2PACKEDX
1108
            YSCALEYUV2RGBX
1109
            WRITEBGR32(%4, %5, %%REGa)
1110
            YSCALEYUV2PACKEDX_END
1111
            return;
1112
        case PIX_FMT_BGR24:
1113
            YSCALEYUV2PACKEDX
1114
            YSCALEYUV2RGBX
1115
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1116
            "add                        %4, %%"REG_c"   \n\t"
1117
            WRITEBGR24(%%REGc, %5, %%REGa)
1118

    
1119
            :: "r" (&c->redDither),
1120
               "m" (dummy), "m" (dummy), "m" (dummy),
1121
               "r" (dest),  "m" (dstW)
1122
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1123
            );
1124
            return;
1125
        case PIX_FMT_BGR555:
1126
            YSCALEYUV2PACKEDX
1127
            YSCALEYUV2RGBX
1128
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1129
#ifdef DITHER1XBPP
1130
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
1131
            "paddusb "MANGLE(g5Dither)", %%mm4  \n\t"
1132
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
1133
#endif
1134

    
1135
            WRITEBGR15(%4, %5, %%REGa)
1136
            YSCALEYUV2PACKEDX_END
1137
            return;
1138
        case PIX_FMT_BGR565:
1139
            YSCALEYUV2PACKEDX
1140
            YSCALEYUV2RGBX
1141
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1142
#ifdef DITHER1XBPP
1143
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
1144
            "paddusb "MANGLE(g6Dither)", %%mm4  \n\t"
1145
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
1146
#endif
1147

    
1148
            WRITEBGR16(%4, %5, %%REGa)
1149
            YSCALEYUV2PACKEDX_END
1150
            return;
1151
        case PIX_FMT_YUYV422:
1152
            YSCALEYUV2PACKEDX
1153
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1154

    
1155
            "psraw $3, %%mm3    \n\t"
1156
            "psraw $3, %%mm4    \n\t"
1157
            "psraw $3, %%mm1    \n\t"
1158
            "psraw $3, %%mm7    \n\t"
1159
            WRITEYUY2(%4, %5, %%REGa)
1160
            YSCALEYUV2PACKEDX_END
1161
            return;
1162
        }
1163
    }
1164
#endif /* HAVE_MMX */
1165
#ifdef HAVE_ALTIVEC
1166
    /* The following list of supported dstFormat values should
1167
       match what's found in the body of altivec_yuv2packedX() */
1168
    if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1169
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1170
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
1171
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1172
                                 chrFilter, chrSrc, chrFilterSize,
1173
                                 dest, dstW, dstY);
1174
    else
1175
#endif
1176
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1177
                       chrFilter, chrSrc, chrFilterSize,
1178
                       dest, dstW, dstY);
1179
}
1180

    
1181
/**
1182
 * vertical bilinear scale YV12 to RGB
1183
 */
1184
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1185
                          uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1186
{
1187
    int yalpha1=yalpha^4095;
1188
    int uvalpha1=uvalpha^4095;
1189
    int i;
1190

    
1191
#if 0 //isn't used
1192
    if (flags&SWS_FULL_CHR_H_INT)
1193
    {
1194
        switch(dstFormat)
1195
        {
1196
#ifdef HAVE_MMX
1197
        case PIX_FMT_RGB32:
1198
            asm volatile(
1199

1200

1201
FULL_YSCALEYUV2RGB
1202
            "punpcklbw %%mm1, %%mm3    \n\t" // BGBGBGBG
1203
            "punpcklbw %%mm7, %%mm0    \n\t" // R0R0R0R0
1204

1205
            "movq      %%mm3, %%mm1    \n\t"
1206
            "punpcklwd %%mm0, %%mm3    \n\t" // BGR0BGR0
1207
            "punpckhwd %%mm0, %%mm1    \n\t" // BGR0BGR0
1208

1209
            MOVNTQ(%%mm3,  (%4, %%REGa, 4))
1210
            MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1211

1212
            "add $4, %%"REG_a"  \n\t"
1213
            "cmp %5, %%"REG_a"  \n\t"
1214
            " jb 1b             \n\t"
1215

1216
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1217
            "m" (yalpha1), "m" (uvalpha1)
1218
            : "%"REG_a
1219
            );
1220
            break;
1221
        case PIX_FMT_BGR24:
1222
            asm volatile(
1223

1224
FULL_YSCALEYUV2RGB
1225

1226
                                              // lsb ... msb
1227
            "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
1228
            "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0
1229

1230
            "movq      %%mm3, %%mm1     \n\t"
1231
            "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
1232
            "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0
1233

1234
            "movq      %%mm3, %%mm2     \n\t" // BGR0BGR0
1235
            "psrlq        $8, %%mm3     \n\t" // GR0BGR00
1236
            "pand "MANGLE(bm00000111)", %%mm2   \n\t" // BGR00000
1237
            "pand "MANGLE(bm11111000)", %%mm3   \n\t" // 000BGR00
1238
            "por       %%mm2, %%mm3     \n\t" // BGRBGR00
1239
            "movq      %%mm1, %%mm2     \n\t"
1240
            "psllq       $48, %%mm1     \n\t" // 000000BG
1241
            "por       %%mm1, %%mm3     \n\t" // BGRBGRBG
1242

1243
            "movq      %%mm2, %%mm1     \n\t" // BGR0BGR0
1244
            "psrld       $16, %%mm2     \n\t" // R000R000
1245
            "psrlq       $24, %%mm1     \n\t" // 0BGR0000
1246
            "por       %%mm2, %%mm1     \n\t" // RBGRR000
1247

1248
            "mov          %4, %%"REG_b" \n\t"
1249
            "add   %%"REG_a", %%"REG_b" \n\t"
1250

1251
#ifdef HAVE_MMX2
1252
            //FIXME Alignment
1253
            "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2)   \n\t"
1254
            "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)   \n\t"
1255
#else
1256
            "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)     \n\t"
1257
            "psrlq  $32, %%mm3                          \n\t"
1258
            "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)     \n\t"
1259
            "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)     \n\t"
1260
#endif
1261
            "add     $4, %%"REG_a"                      \n\t"
1262
            "cmp     %5, %%"REG_a"                      \n\t"
1263
            " jb     1b                                 \n\t"
1264

    
1265
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1266
            "m" (yalpha1), "m" (uvalpha1)
1267
            : "%"REG_a, "%"REG_b
1268
            );
1269
            break;
1270
        case PIX_FMT_BGR555:
1271
            asm volatile(
1272

    
1273
FULL_YSCALEYUV2RGB
1274
#ifdef DITHER1XBPP
1275
            "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1276
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1277
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1278
#endif
1279
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1280
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1281
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1282

    
1283
            "psrlw                   $3, %%mm3  \n\t"
1284
            "psllw                   $2, %%mm1  \n\t"
1285
            "psllw                   $7, %%mm0  \n\t"
1286
            "pand     "MANGLE(g15Mask)", %%mm1  \n\t"
1287
            "pand     "MANGLE(r15Mask)", %%mm0  \n\t"
1288

    
1289
            "por                  %%mm3, %%mm1  \n\t"
1290
            "por                  %%mm1, %%mm0  \n\t"
1291

    
1292
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1293

    
1294
            "add $4, %%"REG_a"  \n\t"
1295
            "cmp %5, %%"REG_a"  \n\t"
1296
            " jb 1b             \n\t"
1297

    
1298
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1299
            "m" (yalpha1), "m" (uvalpha1)
1300
            : "%"REG_a
1301
            );
1302
            break;
1303
        case PIX_FMT_BGR565:
1304
            asm volatile(
1305

    
1306
FULL_YSCALEYUV2RGB
1307
#ifdef DITHER1XBPP
1308
            "paddusb "MANGLE(g6Dither)", %%mm1  \n\t"
1309
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1310
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1311
#endif
1312
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1313
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1314
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1315

    
1316
            "psrlw                   $3, %%mm3  \n\t"
1317
            "psllw                   $3, %%mm1  \n\t"
1318
            "psllw                   $8, %%mm0  \n\t"
1319
            "pand     "MANGLE(g16Mask)", %%mm1  \n\t"
1320
            "pand     "MANGLE(r16Mask)", %%mm0  \n\t"
1321

    
1322
            "por                  %%mm3, %%mm1  \n\t"
1323
            "por                  %%mm1, %%mm0  \n\t"
1324

    
1325
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1326

    
1327
            "add $4, %%"REG_a"  \n\t"
1328
            "cmp %5, %%"REG_a"  \n\t"
1329
            " jb 1b             \n\t"
1330

    
1331
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1332
            "m" (yalpha1), "m" (uvalpha1)
1333
            : "%"REG_a
1334
            );
1335
            break;
1336
#endif /* HAVE_MMX */
1337
        case PIX_FMT_BGR32:
1338
#ifndef HAVE_MMX
1339
        case PIX_FMT_RGB32:
1340
#endif
1341
            if (dstFormat==PIX_FMT_RGB32)
1342
            {
1343
                int i;
1344
#ifdef WORDS_BIGENDIAN
1345
                dest++;
1346
#endif
1347
                for (i=0;i<dstW;i++){
1348
                    // vertical linear interpolation && yuv2rgb in a single step:
1349
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1350
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1351
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1352
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1353
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1354
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1355
                    dest+= 4;
1356
                }
1357
            }
1358
            else if (dstFormat==PIX_FMT_BGR24)
1359
            {
1360
                int i;
1361
                for (i=0;i<dstW;i++){
1362
                    // vertical linear interpolation && yuv2rgb in a single step:
1363
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1364
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1365
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1366
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1367
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1368
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1369
                    dest+= 3;
1370
                }
1371
            }
1372
            else if (dstFormat==PIX_FMT_BGR565)
1373
            {
1374
                int i;
1375
                for (i=0;i<dstW;i++){
1376
                    // vertical linear interpolation && yuv2rgb in a single step:
1377
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1378
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1379
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1380

    
1381
                    ((uint16_t*)dest)[i] =
1382
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1383
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1384
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1385
                }
1386
            }
1387
            else if (dstFormat==PIX_FMT_BGR555)
1388
            {
1389
                int i;
1390
                for (i=0;i<dstW;i++){
1391
                    // vertical linear interpolation && yuv2rgb in a single step:
1392
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1393
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1394
                    int V=((uvbuf0[i+VOFW]*uvalpha1+uvbuf1[i+VOFW]*uvalpha)>>19);
1395

    
1396
                    ((uint16_t*)dest)[i] =
1397
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1398
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1399
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1400
                }
1401
            }
1402
        }//FULL_UV_IPOL
1403
    else
1404
    {
1405
#endif // if 0
1406
#ifdef HAVE_MMX
1407
        switch(c->dstFormat)
1408
        {
1409
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1410
            case PIX_FMT_RGB32:
1411
                asm volatile(
1412
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1413
                "mov        %4, %%"REG_b"               \n\t"
1414
                "push %%"REG_BP"                        \n\t"
1415
                YSCALEYUV2RGB(%%REGBP, %5)
1416
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1417
                "pop %%"REG_BP"                         \n\t"
1418
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1419

    
1420
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1421
                "a" (&c->redDither)
1422
                );
1423
                return;
1424
            case PIX_FMT_BGR24:
1425
                asm volatile(
1426
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1427
                "mov        %4, %%"REG_b"               \n\t"
1428
                "push %%"REG_BP"                        \n\t"
1429
                YSCALEYUV2RGB(%%REGBP, %5)
1430
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1431
                "pop %%"REG_BP"                         \n\t"
1432
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1433
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1434
                "a" (&c->redDither)
1435
                );
1436
                return;
1437
            case PIX_FMT_BGR555:
1438
                asm volatile(
1439
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1440
                "mov        %4, %%"REG_b"               \n\t"
1441
                "push %%"REG_BP"                        \n\t"
1442
                YSCALEYUV2RGB(%%REGBP, %5)
1443
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1444
#ifdef DITHER1XBPP
1445
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1446
                "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1447
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1448
#endif
1449

    
1450
                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1451
                "pop %%"REG_BP"                         \n\t"
1452
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1453

    
1454
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1455
                "a" (&c->redDither)
1456
                );
1457
                return;
1458
            case PIX_FMT_BGR565:
1459
                asm volatile(
1460
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1461
                "mov        %4, %%"REG_b"               \n\t"
1462
                "push %%"REG_BP"                        \n\t"
1463
                YSCALEYUV2RGB(%%REGBP, %5)
1464
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1465
#ifdef DITHER1XBPP
1466
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1467
                "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1468
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1469
#endif
1470

    
1471
                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1472
                "pop %%"REG_BP"                         \n\t"
1473
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1474
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475
                "a" (&c->redDither)
1476
                );
1477
                return;
1478
            case PIX_FMT_YUYV422:
1479
                asm volatile(
1480
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1481
                "mov %4, %%"REG_b"                        \n\t"
1482
                "push %%"REG_BP"                        \n\t"
1483
                YSCALEYUV2PACKED(%%REGBP, %5)
1484
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1485
                "pop %%"REG_BP"                         \n\t"
1486
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1487
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488
                "a" (&c->redDither)
1489
                );
1490
                return;
1491
            default: break;
1492
        }
1493
#endif //HAVE_MMX
1494
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1495
}
1496

    
1497
/**
1498
 * YV12 to RGB without scaling or interpolating
1499
 */
1500
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
1501
                          uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
1502
{
1503
    const int yalpha1=0;
1504
    int i;
1505

    
1506
    uint16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1507
    const int yalpha= 4096; //FIXME ...
1508

    
1509
    if (flags&SWS_FULL_CHR_H_INT)
1510
    {
1511
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
1512
        return;
1513
    }
1514

    
1515
#ifdef HAVE_MMX
1516
    if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1517
    {
1518
        switch(dstFormat)
1519
        {
1520
        case PIX_FMT_RGB32:
1521
            asm volatile(
1522
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1523
            "mov        %4, %%"REG_b"               \n\t"
1524
            "push %%"REG_BP"                        \n\t"
1525
            YSCALEYUV2RGB1(%%REGBP, %5)
1526
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1527
            "pop %%"REG_BP"                         \n\t"
1528
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1529

    
1530
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1531
            "a" (&c->redDither)
1532
            );
1533
            return;
1534
        case PIX_FMT_BGR24:
1535
            asm volatile(
1536
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1537
            "mov        %4, %%"REG_b"               \n\t"
1538
            "push %%"REG_BP"                        \n\t"
1539
            YSCALEYUV2RGB1(%%REGBP, %5)
1540
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1541
            "pop %%"REG_BP"                         \n\t"
1542
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1543

    
1544
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1545
            "a" (&c->redDither)
1546
            );
1547
            return;
1548
        case PIX_FMT_BGR555:
1549
            asm volatile(
1550
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1551
            "mov        %4, %%"REG_b"               \n\t"
1552
            "push %%"REG_BP"                        \n\t"
1553
            YSCALEYUV2RGB1(%%REGBP, %5)
1554
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1555
#ifdef DITHER1XBPP
1556
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1557
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1558
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1559
#endif
1560
            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1561
            "pop %%"REG_BP"                         \n\t"
1562
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1563

    
1564
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1565
            "a" (&c->redDither)
1566
            );
1567
            return;
1568
        case PIX_FMT_BGR565:
1569
            asm volatile(
1570
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1571
            "mov        %4, %%"REG_b"               \n\t"
1572
            "push %%"REG_BP"                        \n\t"
1573
            YSCALEYUV2RGB1(%%REGBP, %5)
1574
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1575
#ifdef DITHER1XBPP
1576
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1577
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1578
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1579
#endif
1580

    
1581
            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1582
            "pop %%"REG_BP"                         \n\t"
1583
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1584

    
1585
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1586
            "a" (&c->redDither)
1587
            );
1588
            return;
1589
        case PIX_FMT_YUYV422:
1590
            asm volatile(
1591
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1592
            "mov        %4, %%"REG_b"               \n\t"
1593
            "push %%"REG_BP"                        \n\t"
1594
            YSCALEYUV2PACKED1(%%REGBP, %5)
1595
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1596
            "pop %%"REG_BP"                         \n\t"
1597
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1598

    
1599
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1600
            "a" (&c->redDither)
1601
            );
1602
            return;
1603
        }
1604
    }
1605
    else
1606
    {
1607
        switch(dstFormat)
1608
        {
1609
        case PIX_FMT_RGB32:
1610
            asm volatile(
1611
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1612
            "mov        %4, %%"REG_b"               \n\t"
1613
            "push %%"REG_BP"                        \n\t"
1614
            YSCALEYUV2RGB1b(%%REGBP, %5)
1615
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1616
            "pop %%"REG_BP"                         \n\t"
1617
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1618

    
1619
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1620
            "a" (&c->redDither)
1621
            );
1622
            return;
1623
        case PIX_FMT_BGR24:
1624
            asm volatile(
1625
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1626
            "mov        %4, %%"REG_b"               \n\t"
1627
            "push %%"REG_BP"                        \n\t"
1628
            YSCALEYUV2RGB1b(%%REGBP, %5)
1629
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1630
            "pop %%"REG_BP"                         \n\t"
1631
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1632

    
1633
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1634
            "a" (&c->redDither)
1635
            );
1636
            return;
1637
        case PIX_FMT_BGR555:
1638
            asm volatile(
1639
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1640
            "mov        %4, %%"REG_b"               \n\t"
1641
            "push %%"REG_BP"                        \n\t"
1642
            YSCALEYUV2RGB1b(%%REGBP, %5)
1643
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1644
#ifdef DITHER1XBPP
1645
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1646
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1647
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1648
#endif
1649
            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1650
            "pop %%"REG_BP"                         \n\t"
1651
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1652

    
1653
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1654
            "a" (&c->redDither)
1655
            );
1656
            return;
1657
        case PIX_FMT_BGR565:
1658
            asm volatile(
1659
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1660
            "mov        %4, %%"REG_b"               \n\t"
1661
            "push %%"REG_BP"                        \n\t"
1662
            YSCALEYUV2RGB1b(%%REGBP, %5)
1663
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1664
#ifdef DITHER1XBPP
1665
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1666
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1667
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1668
#endif
1669

    
1670
            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1671
            "pop %%"REG_BP"                         \n\t"
1672
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1673

    
1674
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1675
            "a" (&c->redDither)
1676
            );
1677
            return;
1678
        case PIX_FMT_YUYV422:
1679
            asm volatile(
1680
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1681
            "mov        %4, %%"REG_b"               \n\t"
1682
            "push %%"REG_BP"                        \n\t"
1683
            YSCALEYUV2PACKED1b(%%REGBP, %5)
1684
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1685
            "pop %%"REG_BP"                         \n\t"
1686
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1687

    
1688
            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1689
            "a" (&c->redDither)
1690
            );
1691
            return;
1692
        }
1693
    }
1694
#endif /* HAVE_MMX */
1695
    if (uvalpha < 2048)
1696
    {
1697
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
1698
    }else{
1699
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
1700
    }
1701
}
1702

    
1703
//FIXME yuy2* can read up to 7 samples too much
1704

    
1705
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
1706
{
1707
#ifdef HAVE_MMX
1708
    asm volatile(
1709
    "movq "MANGLE(bm01010101)", %%mm2           \n\t"
1710
    "mov                    %0, %%"REG_a"       \n\t"
1711
    "1:                                         \n\t"
1712
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
1713
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
1714
    "pand                %%mm2, %%mm0           \n\t"
1715
    "pand                %%mm2, %%mm1           \n\t"
1716
    "packuswb            %%mm1, %%mm0           \n\t"
1717
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
1718
    "add                    $8, %%"REG_a"       \n\t"
1719
    " js                    1b                  \n\t"
1720
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1721
    : "%"REG_a
1722
    );
1723
#else
1724
    int i;
1725
    for (i=0; i<width; i++)
1726
        dst[i]= src[2*i];
1727
#endif
1728
}
1729

    
1730
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1731
{
1732
#ifdef HAVE_MMX
1733
    asm volatile(
1734
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1735
    "mov                    %0, %%"REG_a"       \n\t"
1736
    "1:                                         \n\t"
1737
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1738
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1739
    "psrlw                  $8, %%mm0           \n\t"
1740
    "psrlw                  $8, %%mm1           \n\t"
1741
    "packuswb            %%mm1, %%mm0           \n\t"
1742
    "movq                %%mm0, %%mm1           \n\t"
1743
    "psrlw                  $8, %%mm0           \n\t"
1744
    "pand                %%mm4, %%mm1           \n\t"
1745
    "packuswb            %%mm0, %%mm0           \n\t"
1746
    "packuswb            %%mm1, %%mm1           \n\t"
1747
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
1748
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
1749
    "add                    $4, %%"REG_a"       \n\t"
1750
    " js                    1b                  \n\t"
1751
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1752
    : "%"REG_a
1753
    );
1754
#else
1755
    int i;
1756
    for (i=0; i<width; i++)
1757
    {
1758
        dstU[i]= src1[4*i + 1];
1759
        dstV[i]= src1[4*i + 3];
1760
    }
1761
#endif
1762
    assert(src1 == src2);
1763
}
1764

    
1765
/* This is almost identical to the previous, end exists only because
1766
 * yuy2ToY/UV)(dst, src+1, ...) would have 100% unaligned accesses. */
1767
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
1768
{
1769
#ifdef HAVE_MMX
1770
    asm volatile(
1771
    "mov                  %0, %%"REG_a"         \n\t"
1772
    "1:                                         \n\t"
1773
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
1774
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
1775
    "psrlw                $8, %%mm0             \n\t"
1776
    "psrlw                $8, %%mm1             \n\t"
1777
    "packuswb          %%mm1, %%mm0             \n\t"
1778
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
1779
    "add                  $8, %%"REG_a"         \n\t"
1780
    " js                  1b                    \n\t"
1781
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
1782
    : "%"REG_a
1783
    );
1784
#else
1785
    int i;
1786
    for (i=0; i<width; i++)
1787
        dst[i]= src[2*i+1];
1788
#endif
1789
}
1790

    
1791
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
1792
{
1793
#ifdef HAVE_MMX
1794
    asm volatile(
1795
    "movq "MANGLE(bm01010101)", %%mm4           \n\t"
1796
    "mov                    %0, %%"REG_a"       \n\t"
1797
    "1:                                         \n\t"
1798
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
1799
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
1800
    "pand                %%mm4, %%mm0           \n\t"
1801
    "pand                %%mm4, %%mm1           \n\t"
1802
    "packuswb            %%mm1, %%mm0           \n\t"
1803
    "movq                %%mm0, %%mm1           \n\t"
1804
    "psrlw                  $8, %%mm0           \n\t"
1805
    "pand                %%mm4, %%mm1           \n\t"
1806
    "packuswb            %%mm0, %%mm0           \n\t"
1807
    "packuswb            %%mm1, %%mm1           \n\t"
1808
    "movd                %%mm0, (%3, %%"REG_a") \n\t"
1809
    "movd                %%mm1, (%2, %%"REG_a") \n\t"
1810
    "add                    $4, %%"REG_a"       \n\t"
1811
    " js                    1b                  \n\t"
1812
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
1813
    : "%"REG_a
1814
    );
1815
#else
1816
    int i;
1817
    for (i=0; i<width; i++)
1818
    {
1819
        dstU[i]= src1[4*i + 0];
1820
        dstV[i]= src1[4*i + 2];
1821
    }
1822
#endif
1823
    assert(src1 == src2);
1824
}
1825

    
1826
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1827
{
1828
    int i;
1829
    for (i=0; i<width; i++)
1830
    {
1831
        int b=  ((uint32_t*)src)[i]&0xFF;
1832
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
1833
        int r= (((uint32_t*)src)[i]>>16)&0xFF;
1834

    
1835
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1836
    }
1837
}
1838

    
1839
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1840
{
1841
    int i;
1842
    assert(src1 == src2);
1843
    for (i=0; i<width; i++)
1844
    {
1845
        const int a= ((uint32_t*)src1)[2*i+0];
1846
        const int e= ((uint32_t*)src1)[2*i+1];
1847
        const int l= (a&0xFF00FF) + (e&0xFF00FF);
1848
        const int h= (a&0x00FF00) + (e&0x00FF00);
1849
        const int b=  l&0x3FF;
1850
        const int g=  h>>8;
1851
        const int r=  l>>16;
1852

    
1853
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1854
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1855
    }
1856
}
1857

    
1858
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
1859
{
1860
#ifdef HAVE_MMX
1861
    asm volatile(
1862
    "mov                        %2, %%"REG_a"   \n\t"
1863
    "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t"
1864
    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
1865
    "pxor                    %%mm7, %%mm7       \n\t"
1866
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
1867
    ASMALIGN(4)
1868
    "1:                                         \n\t"
1869
    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
1870
    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
1871
    "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
1872
    "punpcklbw               %%mm7, %%mm0       \n\t"
1873
    "punpcklbw               %%mm7, %%mm1       \n\t"
1874
    "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
1875
    "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
1876
    "punpcklbw               %%mm7, %%mm2       \n\t"
1877
    "punpcklbw               %%mm7, %%mm3       \n\t"
1878
    "pmaddwd                 %%mm6, %%mm0       \n\t"
1879
    "pmaddwd                 %%mm6, %%mm1       \n\t"
1880
    "pmaddwd                 %%mm6, %%mm2       \n\t"
1881
    "pmaddwd                 %%mm6, %%mm3       \n\t"
1882
#ifndef FAST_BGR2YV12
1883
    "psrad                      $8, %%mm0       \n\t"
1884
    "psrad                      $8, %%mm1       \n\t"
1885
    "psrad                      $8, %%mm2       \n\t"
1886
    "psrad                      $8, %%mm3       \n\t"
1887
#endif
1888
    "packssdw                %%mm1, %%mm0       \n\t"
1889
    "packssdw                %%mm3, %%mm2       \n\t"
1890
    "pmaddwd                 %%mm5, %%mm0       \n\t"
1891
    "pmaddwd                 %%mm5, %%mm2       \n\t"
1892
    "packssdw                %%mm2, %%mm0       \n\t"
1893
    "psraw                      $7, %%mm0       \n\t"
1894

    
1895
    "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
1896
    "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
1897
    "punpcklbw               %%mm7, %%mm4       \n\t"
1898
    "punpcklbw               %%mm7, %%mm1       \n\t"
1899
    "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
1900
    "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
1901
    "punpcklbw               %%mm7, %%mm2       \n\t"
1902
    "punpcklbw               %%mm7, %%mm3       \n\t"
1903
    "pmaddwd                 %%mm6, %%mm4       \n\t"
1904
    "pmaddwd                 %%mm6, %%mm1       \n\t"
1905
    "pmaddwd                 %%mm6, %%mm2       \n\t"
1906
    "pmaddwd                 %%mm6, %%mm3       \n\t"
1907
#ifndef FAST_BGR2YV12
1908
    "psrad                      $8, %%mm4       \n\t"
1909
    "psrad                      $8, %%mm1       \n\t"
1910
    "psrad                      $8, %%mm2       \n\t"
1911
    "psrad                      $8, %%mm3       \n\t"
1912
#endif
1913
    "packssdw                %%mm1, %%mm4       \n\t"
1914
    "packssdw                %%mm3, %%mm2       \n\t"
1915
    "pmaddwd                 %%mm5, %%mm4       \n\t"
1916
    "pmaddwd                 %%mm5, %%mm2       \n\t"
1917
    "add                       $24, %%"REG_d"   \n\t"
1918
    "packssdw                %%mm2, %%mm4       \n\t"
1919
    "psraw                      $7, %%mm4       \n\t"
1920

    
1921
    "packuswb                %%mm4, %%mm0       \n\t"
1922
    "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
1923

    
1924
    "movq                    %%mm0, (%1, %%"REG_a") \n\t"
1925
    "add                        $8, %%"REG_a"   \n\t"
1926
    " js                        1b              \n\t"
1927
    : : "r" (src+width*3), "r" (dst+width), "g" (-width)
1928
    : "%"REG_a, "%"REG_d
1929
    );
1930
#else
1931
    int i;
1932
    for (i=0; i<width; i++)
1933
    {
1934
        int b= src[i*3+0];
1935
        int g= src[i*3+1];
1936
        int r= src[i*3+2];
1937

    
1938
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1939
    }
1940
#endif /* HAVE_MMX */
1941
}
1942

    
1943
/* Convert one line of packed 24-bit BGR to horizontally subsampled chroma:
 * each U/V output sample is derived from the average of two adjacent source
 * pixels.  src1 and src2 must point at the same line (asserted below);
 * `width` is the number of chroma samples to produce.
 * The MMX loop indexes backwards from the end of the buffers using a
 * negative counter in REG_a (and 6x that in REG_d for the 6-bytes-per-
 * output-sample source stride). */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    asm volatile(
    "mov                        %3, %%"REG_a"   \n\t" // REG_a = -width (loop counter)
    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
    "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t"
    "pxor                    %%mm7, %%mm7       \n\t" // mm7 = 0, for byte->word unpack
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
    "add                 %%"REG_d", %%"REG_d"   \n\t" // REG_d = 6*(-width): src byte offset
    ASMALIGN(4)
    "1:                                         \n\t"
    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    /* Average each pixel with its neighbour (shifted 3 bytes = 24 bits)
       via PAVGB, then unpack bytes to words. */
    "movq          (%0, %%"REG_d"), %%mm0       \n\t"
    "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
    "movq                    %%mm0, %%mm1       \n\t"
    "movq                    %%mm2, %%mm3       \n\t"
    "psrlq                     $24, %%mm0       \n\t"
    "psrlq                     $24, %%mm2       \n\t"
    PAVGB(%%mm1, %%mm0)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
#else
    /* No PAVGB available: add the two pixels as words and halve. */
    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
    "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "paddw                   %%mm2, %%mm0       \n\t"
    "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
    "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm4       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "paddw                   %%mm4, %%mm2       \n\t"
    "psrlw                      $1, %%mm0       \n\t"
    "psrlw                      $1, %%mm2       \n\t"
#endif
    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"

    /* Dot products: mm1/mm3 accumulate V (V coeffs), mm0/mm2 accumulate U
       (U coeffs were preloaded into mm6 above). */
    "pmaddwd                 %%mm0, %%mm1       \n\t"
    "pmaddwd                 %%mm2, %%mm3       \n\t"
    "pmaddwd                 %%mm6, %%mm0       \n\t"
    "pmaddwd                 %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                      $8, %%mm0       \n\t"
    "psrad                      $8, %%mm1       \n\t"
    "psrad                      $8, %%mm2       \n\t"
    "psrad                      $8, %%mm3       \n\t"
#endif
    "packssdw                %%mm2, %%mm0       \n\t"
    "packssdw                %%mm3, %%mm1       \n\t"
    "pmaddwd                 %%mm5, %%mm0       \n\t" // horizontal add via w1111
    "pmaddwd                 %%mm5, %%mm1       \n\t"
    "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
    "psraw                      $7, %%mm0       \n\t"

    /* Same computation for the next two output samples (src bytes 12..23). */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    "movq       12(%0, %%"REG_d"), %%mm4       \n\t"
    "movq       18(%0, %%"REG_d"), %%mm2       \n\t"
    "movq                   %%mm4, %%mm1       \n\t"
    "movq                   %%mm2, %%mm3       \n\t"
    "psrlq                    $24, %%mm4       \n\t"
    "psrlq                    $24, %%mm2       \n\t"
    PAVGB(%%mm1, %%mm4)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw              %%mm7, %%mm4       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
#else
    "movd       12(%0, %%"REG_d"), %%mm4       \n\t"
    "movd       15(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw              %%mm7, %%mm4       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
    "paddw                  %%mm2, %%mm4       \n\t"
    "movd       18(%0, %%"REG_d"), %%mm5       \n\t"
    "movd       21(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw              %%mm7, %%mm5       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
    "paddw                  %%mm5, %%mm2       \n\t"
    "movq      "MANGLE(ff_w1111)", %%mm5       \n\t" // mm5 was clobbered, reload
    "psrlw                     $2, %%mm4       \n\t"
    "psrlw                     $2, %%mm2       \n\t"
#endif
    "movq "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
    "movq "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"

    "pmaddwd                %%mm4, %%mm1       \n\t"
    "pmaddwd                %%mm2, %%mm3       \n\t"
    "pmaddwd                %%mm6, %%mm4       \n\t"
    "pmaddwd                %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                     $8, %%mm4       \n\t"
    "psrad                     $8, %%mm1       \n\t"
    "psrad                     $8, %%mm2       \n\t"
    "psrad                     $8, %%mm3       \n\t"
#endif
    "packssdw               %%mm2, %%mm4       \n\t"
    "packssdw               %%mm3, %%mm1       \n\t"
    "pmaddwd                %%mm5, %%mm4       \n\t"
    "pmaddwd                %%mm5, %%mm1       \n\t"
    "add                      $24, %%"REG_d"   \n\t" // advance src by 8 pixels
    "packssdw               %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
    "psraw                     $7, %%mm4       \n\t"

    /* Interleave the two result quads, saturate to bytes, bias by +128. */
    "movq                   %%mm0, %%mm1       \n\t"
    "punpckldq              %%mm4, %%mm0       \n\t"
    "punpckhdq              %%mm4, %%mm1       \n\t"
    "packsswb               %%mm1, %%mm0       \n\t"
    "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0    \n\t"

    "movd                   %%mm0, (%1, %%"REG_a")  \n\t" // 4 U samples
    "punpckhdq              %%mm0, %%mm0            \n\t"
    "movd                   %%mm0, (%2, %%"REG_a")  \n\t" // 4 V samples
    "add                       $4, %%"REG_a"        \n\t"
    " js                       1b                   \n\t"
    : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
    : "%"REG_a, "%"REG_d
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        /* Sum of the two source pixels feeding this chroma sample;
           the extra >>1 in the shift below performs the averaging. */
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
    }
#endif /* HAVE_MMX */
    assert(src1 == src2);
}

static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2078
{
2079
    int i;
2080
    for (i=0; i<width; i++)
2081
    {
2082
        int d= ((uint16_t*)src)[i];
2083
        int b= d&0x1F;
2084
        int g= (d>>5)&0x3F;
2085
        int r= (d>>11)&0x1F;
2086

    
2087
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2088
    }
2089
}
2090

    
2091
/* Convert pairs of packed 16-bit 565 pixels to subsampled chroma.
 * Two adjacent pixels are loaded as one 32-bit word and their colour
 * fields are summed with SWAR (packed-integer) arithmetic before the
 * U/V dot product.  src1 must equal src2 (asserted).
 * NOTE(review): reads the source 32 bits at a time, so the buffer must
 * be readable one pixel past an odd width — confirm against callers. */
static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];

        /* Isolate the two pixels' fields on disjoint bit lanes ... */
        int dl= (d0&0x07E0F81F);
        int dh= ((d0>>5)&0x07C0F83F);

        /* ... realign the second pixel onto the first one's lanes and add,
           giving per-channel sums with no cross-field carries. */
        int dh2= (dh>>11) + (dh<<21);
        int d= dh2 + dl;

        int b= d&0x7F;          // sum of the two 5-bit b fields
        int r= (d>>11)&0x7F;    // sum of the two 5-bit r fields
        int g= d>>21;           // sum of the two 6-bit g fields
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
    }
}

static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2114
{
2115
    int i;
2116
    for (i=0; i<width; i++)
2117
    {
2118
        int d= ((uint16_t*)src)[i];
2119
        int b= d&0x1F;
2120
        int g= (d>>5)&0x1F;
2121
        int r= (d>>10)&0x1F;
2122

    
2123
        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2124
    }
2125
}
2126

    
2127
/* Convert pairs of packed 15-bit 555 pixels to subsampled chroma using
 * the same SWAR pixel-pair summing as rgb16ToUV, with 5-bit g masks.
 * src1 must equal src2 (asserted).
 * NOTE(review): reads 32 bits per pair — same odd-width caveat as
 * rgb16ToUV. */
static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];

        /* Isolate both pixels' fields on disjoint lanes ... */
        int dl= (d0&0x03E07C1F);
        int dh= ((d0>>5)&0x03E0F81F);

        /* ... then realign and add to get per-channel sums. */
        int dh2= (dh>>11) + (dh<<21);
        int d= dh2 + dl;

        int b= d&0x7F;          // sum of the two 5-bit b fields
        int r= (d>>10)&0x7F;    // sum of the two 5-bit r fields
        int g= d>>21;           // sum of the two 5-bit g fields
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
    }
}

static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2151
{
2152
    int i;
2153
    for (i=0; i<width; i++)
2154
    {
2155
        int r=  ((uint32_t*)src)[i]&0xFF;
2156
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
2157
        int b= (((uint32_t*)src)[i]>>16)&0xFF;
2158

    
2159
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2160
    }
2161
}
2162

    
2163
/* Convert pairs of packed 32-bit pixels to subsampled chroma.  The two
 * pixels of each pair are summed channel-wise with packed arithmetic:
 * r and b share one accumulator (they sit 16 bits apart), g uses the
 * other.  src1 must equal src2 (asserted). */
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1==src2);
    for (i=0; i<width; i++)
    {
        const int a= ((uint32_t*)src1)[2*i+0];
        const int e= ((uint32_t*)src1)[2*i+1];
        const int l= (a&0xFF00FF) + (e&0xFF00FF);   // r sums (low), b sums (high)
        const int h= (a&0x00FF00) + (e&0x00FF00);   // g sums
        const int r=  l&0x3FF;                      // <= 510, fits in 10 bits
        const int g=  h>>8;
        const int b=  l>>16;

        /* The extra >>1 averages the two pixels. */
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
    }
}

static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2183
{
2184
    int i;
2185
    for (i=0; i<width; i++)
2186
    {
2187
        int r= src[i*3+0];
2188
        int g= src[i*3+1];
2189
        int b= src[i*3+2];
2190

    
2191
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2192
    }
2193
}
2194

    
2195
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2196
{
2197
    int i;
2198
    assert(src1==src2);
2199
    for (i=0; i<width; i++)
2200
    {
2201
        int r= src1[6*i + 0] + src1[6*i + 3];
2202
        int g= src1[6*i + 1] + src1[6*i + 4];
2203
        int b= src1[6*i + 2] + src1[6*i + 5];
2204

    
2205
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2206
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2207
    }
2208
}
2209

    
2210
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2211
{
2212
    int i;
2213
    for (i=0; i<width; i++)
2214
    {
2215
        int d= ((uint16_t*)src)[i];
2216
        int r= d&0x1F;
2217
        int g= (d>>5)&0x3F;
2218
        int b= (d>>11)&0x1F;
2219

    
2220
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2221
    }
2222
}
2223

    
2224
/* Convert pairs of packed 16-bit 565 pixels (r in the low bits) to
 * subsampled chroma, summing the pixel pair with SWAR arithmetic.
 * src1 must equal src2 (asserted).
 * NOTE(review): reads 32 bits per pair — the buffer must be readable one
 * pixel past an odd width; confirm against callers. */
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];

        /* Swap the two pixels with a rotate-by-16 and add the masked
           copies: the fields land on disjoint lanes, so the per-channel
           sums cannot carry into each other. */
        int dl= (d0&0x07E0F81F);
        int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);

        int r= d&0x3F;          // sum of the two 5-bit r fields
        int b= (d>>11)&0x3F;    // sum of the two 5-bit b fields
        int g= d>>21;           // sum of the two 6-bit g fields
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
    }
}

static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2244
{
2245
    int i;
2246
    for (i=0; i<width; i++)
2247
    {
2248
        int d= ((uint16_t*)src)[i];
2249
        int r= d&0x1F;
2250
        int g= (d>>5)&0x1F;
2251
        int b= (d>>10)&0x1F;
2252

    
2253
        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2254
    }
2255
}
2256

    
2257
/* Convert pairs of packed 15-bit 555 pixels (r in the low bits) to
 * subsampled chroma, summing the pixel pair with SWAR arithmetic.
 * src1 must equal src2 (asserted).
 * NOTE(review): reads 32 bits per pair — same odd-width caveat as
 * bgr16ToUV. */
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
{
    int i;
    assert(src1 == src2);
    for (i=0; i<width; i++)
    {
        int d0= ((uint32_t*)src1)[i];

        /* Rotate-by-16 swaps the pixels; the masked add produces
           carry-free per-channel sums on disjoint lanes. */
        int dl= (d0&0x03E07C1F);
        int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);

        int r= d&0x3F;          // sum of the two 5-bit r fields
        int b= (d>>10)&0x3F;    // sum of the two 5-bit b fields
        int g= d>>21;           // sum of the two 5-bit g fields
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
    }
}

static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2277
{
2278
    int i;
2279
    for (i=0; i<width; i++)
2280
    {
2281
        int d= src[i];
2282

    
2283
        dst[i]= pal[d] & 0xFF;
2284
    }
2285
}
2286

    
2287
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2288
{
2289
    int i;
2290
    assert(src1 == src2);
2291
    for (i=0; i<width; i++)
2292
    {
2293
        int p= pal[src1[i]];
2294

    
2295
        dstU[i]= p>>8;
2296
        dstV[i]= p>>16;
2297
    }
2298
}
2299

    
2300
// bilinear / bicubic scaling
2301
/* Generic horizontal FIR scaler:
 *     dst[i] = clip( sum_j src[filterPos[i]+j] * filter[filterSize*i+j] >> 7 )
 * MMX builds use hand-unrolled paths for filterSize 4 and 8 and a generic
 * two-level loop otherwise; the pointers are pre-biased so a negative
 * counter can run up to zero ("jnc" exits when the add no longer borrows).
 * Without MMX, AltiVec or plain C is used.  xInc and srcW are unused here;
 * they are kept for the common scaler signature. */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#ifdef HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        long counter= -2*dstW;
        // Bias the pointers so that indexing by the negative counter
        // (and counter/2 for the 16-bit arrays) lands on element 0.
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push            %%"REG_b"              \n\t" // ebx is the PIC register, preserve it
#endif
        "pxor                %%mm7, %%mm7       \n\t"
        "movq        "MANGLE(w02)", %%mm6       \n\t"
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t" // filterPos for two outputs
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t" // 4 filter taps each
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
        "movd      (%3, %%"REG_a"), %%mm0       \n\t" // 4 source bytes each
        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
        "punpcklbw           %%mm7, %%mm0       \n\t"
        "punpcklbw           %%mm7, %%mm2       \n\t"
        "pmaddwd             %%mm1, %%mm0       \n\t"
        "pmaddwd             %%mm2, %%mm3       \n\t"
        "psrad                  $8, %%mm0       \n\t"
        "psrad                  $8, %%mm3       \n\t"
        "packssdw            %%mm3, %%mm0       \n\t"
        "pmaddwd             %%mm6, %%mm0       \n\t" // horizontal add of the pairs
        "packssdw            %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
        "add                    $4, %%"REG_BP"  \n\t"
        " jnc                   1b              \n\t"

        "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
        "pop             %%"REG_b"              \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        long counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push             %%"REG_b"             \n\t"
#endif
        "pxor                 %%mm7, %%mm7      \n\t"
        "movq         "MANGLE(w02)", %%mm6      \n\t"
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
        "mov              %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t" // taps 0..3
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm0      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm0      \n\t"
        "pmaddwd              %%mm2, %%mm3      \n\t"

        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t" // taps 4..7
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm4      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm4      \n\t"
        "pmaddwd              %%mm2, %%mm5      \n\t"
        "paddd                %%mm4, %%mm0      \n\t"
        "paddd                %%mm5, %%mm3      \n\t"

        "psrad                   $8, %%mm0      \n\t"
        "psrad                   $8, %%mm3      \n\t"
        "packssdw             %%mm3, %%mm0      \n\t"
        "pmaddwd              %%mm6, %%mm0      \n\t"
        "packssdw             %%mm0, %%mm0      \n\t"
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
        "add                     $4, %%"REG_BP" \n\t"
        " jnc                    1b             \n\t"

        "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
        "pop              %%"REG_b"             \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else
    {
        // Generic filterSize: inner loop "2:" walks the taps 4 at a time
        // until the src cursor reaches `offset` (= src+filterSize).
        uint8_t *offset = src+filterSize;
        long counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
        "pxor                  %%mm7, %%mm7     \n\t"
        "movq          "MANGLE(w02)", %%mm6     \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "mov                      %2, %%"REG_c" \n\t"
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
        "mov                      %5, %%"REG_c" \n\t"
        "pxor                  %%mm4, %%mm4     \n\t"
        "pxor                  %%mm5, %%mm5     \n\t"
        "2:                                     \n\t"
        "movq                   (%1), %%mm1     \n\t"
        "movq               (%1, %6), %%mm3     \n\t" // taps of the second output
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
        "punpcklbw             %%mm7, %%mm0     \n\t"
        "punpcklbw             %%mm7, %%mm2     \n\t"
        "pmaddwd               %%mm1, %%mm0     \n\t"
        "pmaddwd               %%mm2, %%mm3     \n\t"
        "paddd                 %%mm3, %%mm5     \n\t"
        "paddd                 %%mm0, %%mm4     \n\t"
        "add                      $8, %1        \n\t"
        "add                      $4, %%"REG_c" \n\t"
        "cmp                      %4, %%"REG_c" \n\t"
        " jb                      2b            \n\t"
        "add                      %6, %1        \n\t" // skip the second output's taps
        "psrad                    $8, %%mm4     \n\t"
        "psrad                    $8, %%mm5     \n\t"
        "packssdw              %%mm5, %%mm4     \n\t"
        "pmaddwd               %%mm6, %%mm4     \n\t"
        "packssdw              %%mm4, %%mm4     \n\t"
        "mov                      %3, %%"REG_a" \n\t"
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
        "add                      $4, %0        \n\t"
        " jnc                     1b            \n\t"

        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" (filterSize*2)
        : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#ifdef HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* HAVE_ALTIVEC */
#endif /* HAVE_MMX */
}
      // *** horizontal scale Y line to temp buffer
2484
/* Horizontally scale one luma line into the 16-bit temp buffer `dst`
 * (values are left-shifted by 7 relative to 8-bit luma).
 * Non-natively-supported input formats are first converted to an 8-bit
 * luma line in formatConvBuffer.  Scaling is then done either by the
 * generic hScale(), by runtime-generated MMX2 code (`funnyYCode`), or
 * by a fast-bilinear x86 asm loop / plain C loop. */
static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint8_t *pal)
{
    /* Input-format dispatch: convert to plain 8-bit luma if needed. */
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, (uint32_t*)pal);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // fast bilinear upscale / crap downscale
    {
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            /* Jump into runtime-generated scaling code (funnyYCode);
               mmx2FilterPos supplies per-chunk source advances. */
            asm volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t" // save ebx (PIC register)
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t" // restore ebx
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyYCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            // Patch up the tail where the bilinear pair would read past srcW.
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif /* HAVE_MMX2 */
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        /* 16.16 fixed-point DDA, two output samples per iteration; the
           carry of the fractional add (adc) advances the integer part. */
        asm volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* Portable C bilinear: same 16.16 fixed-point interpolation. */
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif /* defined(ARCH_X86) */
    }
}

inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2687
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2688
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2689
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2690
                                   int32_t *mmx2FilterPos, uint8_t *pal)
2691
{
2692
    if (srcFormat==PIX_FMT_YUYV422)
2693
    {
2694
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2695
        src1= formatConvBuffer;
2696
        src2= formatConvBuffer+VOFW;
2697
    }
2698
    else if (srcFormat==PIX_FMT_UYVY422)
2699
    {
2700
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2701
        src1= formatConvBuffer;
2702
        src2= formatConvBuffer+VOFW;
2703
    }
2704
    else if (srcFormat==PIX_FMT_RGB32)
2705
    {
2706
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2707
        src1= formatConvBuffer;
2708
        src2= formatConvBuffer+VOFW;
2709
    }
2710
    else if (srcFormat==PIX_FMT_BGR24)
2711
    {
2712
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2713
        src1= formatConvBuffer;
2714
        src2= formatConvBuffer+VOFW;
2715
    }
2716
    else if (srcFormat==PIX_FMT_BGR565)
2717
    {
2718
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2719
        src1= formatConvBuffer;
2720
        src2= formatConvBuffer+VOFW;
2721
    }
2722
    else if (srcFormat==PIX_FMT_BGR555)
2723
    {
2724
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2725
        src1= formatConvBuffer;
2726
        src2= formatConvBuffer+VOFW;
2727
    }
2728
    else if (srcFormat==PIX_FMT_BGR32)
2729
    {
2730
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2731
        src1= formatConvBuffer;
2732
        src2= formatConvBuffer+VOFW;
2733
    }
2734
    else if (srcFormat==PIX_FMT_RGB24)
2735
    {
2736
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2737
        src1= formatConvBuffer;
2738
        src2= formatConvBuffer+VOFW;
2739
    }
2740
    else if (srcFormat==PIX_FMT_RGB565)
2741
    {
2742
        RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2743
        src1= formatConvBuffer;
2744
        src2= formatConvBuffer+VOFW;
2745
    }
2746
    else if (srcFormat==PIX_FMT_RGB555)
2747
    {
2748
        RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW);
2749
        src1= formatConvBuffer;
2750
        src2= formatConvBuffer+VOFW;
2751
    }
2752
    else if (isGray(srcFormat))
2753
    {
2754
        return;
2755
    }
2756
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2757
    {
2758
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+VOFW, src1, src2, srcW, (uint32_t*)pal);
2759
        src1= formatConvBuffer;
2760
        src2= formatConvBuffer+VOFW;
2761
    }
2762

    
2763
#ifdef HAVE_MMX
2764
    // Use the new MMX scaler if the MMX2 one can't be used (it is faster than the x86 ASM one).
2765
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2766
#else
2767
    if (!(flags&SWS_FAST_BILINEAR))
2768
#endif
2769
    {
2770
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2771
        RENAME(hScale)(dst+VOFW, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2772
    }
2773
    else // fast bilinear upscale / crap downscale
2774
    {
2775
#if defined(ARCH_X86)
2776
#ifdef HAVE_MMX2
2777
        int i;
2778
#if defined(PIC)
2779
        uint64_t ebxsave __attribute__((aligned(8)));
2780
#endif
2781
        if (canMMX2BeUsed)
2782
        {
2783
            asm volatile(
2784
#if defined(PIC)
2785
            "mov          %%"REG_b", %6         \n\t"
2786
#endif
2787
            "pxor             %%mm7, %%mm7      \n\t"
2788
            "mov                 %0, %%"REG_c"  \n\t"
2789
            "mov                 %1, %%"REG_D"  \n\t"
2790
            "mov                 %2, %%"REG_d"  \n\t"
2791
            "mov                 %3, %%"REG_b"  \n\t"
2792
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2793
            PREFETCH"   (%%"REG_c")             \n\t"
2794
            PREFETCH" 32(%%"REG_c")             \n\t"
2795
            PREFETCH" 64(%%"REG_c")             \n\t"
2796

    
2797
#ifdef ARCH_X86_64
2798

    
2799
#define FUNNY_UV_CODE \
2800
            "movl       (%%"REG_b"), %%esi      \n\t"\
2801
            "call               *%4             \n\t"\
2802
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2803
            "add          %%"REG_S", %%"REG_c"  \n\t"\
2804
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2805
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2806

    
2807
#else
2808

    
2809
#define FUNNY_UV_CODE \
2810
            "movl       (%%"REG_b"), %%esi      \n\t"\
2811
            "call               *%4             \n\t"\
2812
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2813
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2814
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2815

    
2816
#endif /* ARCH_X86_64 */
2817

    
2818
FUNNY_UV_CODE
2819
FUNNY_UV_CODE
2820
FUNNY_UV_CODE
2821
FUNNY_UV_CODE
2822
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2823
            "mov                 %5, %%"REG_c"  \n\t" // src
2824
            "mov                 %1, %%"REG_D"  \n\t" // buf1
2825
            "add              $"AV_STRINGIFY(VOF)", %%"REG_D"  \n\t"
2826
            PREFETCH"   (%%"REG_c")             \n\t"
2827
            PREFETCH" 32(%%"REG_c")             \n\t"
2828
            PREFETCH" 64(%%"REG_c")             \n\t"
2829

    
2830
FUNNY_UV_CODE
2831
FUNNY_UV_CODE
2832
FUNNY_UV_CODE
2833
FUNNY_UV_CODE
2834

    
2835
#if defined(PIC)
2836
            "mov %6, %%"REG_b"    \n\t"
2837
#endif
2838
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2839
            "m" (funnyUVCode), "m" (src2)
2840
#if defined(PIC)
2841
            ,"m" (ebxsave)
2842
#endif
2843
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2844
#if !defined(PIC)
2845
             ,"%"REG_b
2846
#endif
2847
            );
2848
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2849
            {
2850
                //printf("%d %d %d\n", dstWidth, i, srcW);
2851
                dst[i] = src1[srcW-1]*128;
2852
                dst[i+VOFW] = src2[srcW-1]*128;
2853
            }
2854
        }
2855
        else
2856
        {
2857
#endif /* HAVE_MMX2 */
2858
            long xInc_shr16 = (long) (xInc >> 16);
2859
            uint16_t xInc_mask = xInc & 0xffff;
2860
            asm volatile(
2861
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
2862
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2863
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
2864
            ASMALIGN(4)
2865
            "1:                                     \n\t"
2866
            "mov        %0, %%"REG_S"               \n\t"
2867
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2868
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2869
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2870
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2871
            "shll      $16, %%edi                   \n\t"
2872
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2873
            "mov        %1, %%"REG_D"               \n\t"
2874
            "shrl       $9, %%esi                   \n\t"
2875
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2876

    
2877
            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2878
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2879
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2880
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2881
            "shll      $16, %%edi                   \n\t"
2882
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2883
            "mov        %1, %%"REG_D"               \n\t"
2884
            "shrl       $9, %%esi                   \n\t"
2885
            "movw     %%si, "AV_STRINGIFY(VOF)"(%%"REG_D", %%"REG_a", 2)   \n\t"
2886

    
2887
            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
2888
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
2889
            "add        $1, %%"REG_a"               \n\t"
2890
            "cmp        %2, %%"REG_a"               \n\t"
2891
            " jb        1b                          \n\t"
2892

    
2893
/* GCC 3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2894
   which is needed to support GCC 4.0. */
2895
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2896
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2897
#else
2898
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2899
#endif
2900
            "r" (src2)
2901
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2902
            );
2903
#ifdef HAVE_MMX2
2904
        } //if MMX2 can't be used
2905
#endif
2906
#else
2907
        int i;
2908
        unsigned int xpos=0;
2909
        for (i=0;i<dstWidth;i++)
2910
        {
2911
            register unsigned int xx=xpos>>16;
2912
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
2913
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2914
            dst[i+VOFW]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2915
            /* slower
2916
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2917
            dst[i+VOFW]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2918
            */
2919
            xpos+=xInc;
2920
        }
2921
#endif /* defined(ARCH_X86) */
2922
    }
2923
}
2924

    
2925
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
2926
                           int srcSliceH, uint8_t* dst[], int dstStride[]){
2927

    
2928
    /* load a few things into local vars to make the code more readable? and faster */
2929
    const int srcW= c->srcW;
2930
    const int dstW= c->dstW;
2931
    const int dstH= c->dstH;
2932
    const int chrDstW= c->chrDstW;
2933
    const int chrSrcW= c->chrSrcW;
2934
    const int lumXInc= c->lumXInc;
2935
    const int chrXInc= c->chrXInc;
2936
    const int dstFormat= c->dstFormat;
2937
    const int srcFormat= c->srcFormat;
2938
    const int flags= c->flags;
2939
    const int canMMX2BeUsed= c->canMMX2BeUsed;
2940
    int16_t *vLumFilterPos= c->vLumFilterPos;
2941
    int16_t *vChrFilterPos= c->vChrFilterPos;
2942
    int16_t *hLumFilterPos= c->hLumFilterPos;
2943
    int16_t *hChrFilterPos= c->hChrFilterPos;
2944
    int16_t *vLumFilter= c->vLumFilter;
2945
    int16_t *vChrFilter= c->vChrFilter;
2946
    int16_t *hLumFilter= c->hLumFilter;
2947
    int16_t *hChrFilter= c->hChrFilter;
2948
    int32_t *lumMmxFilter= c->lumMmxFilter;
2949
    int32_t *chrMmxFilter= c->chrMmxFilter;
2950
    const int vLumFilterSize= c->vLumFilterSize;
2951
    const int vChrFilterSize= c->vChrFilterSize;
2952
    const int hLumFilterSize= c->hLumFilterSize;
2953
    const int hChrFilterSize= c->hChrFilterSize;
2954
    int16_t **lumPixBuf= c->lumPixBuf;
2955
    int16_t **chrPixBuf= c->chrPixBuf;
2956
    const int vLumBufSize= c->vLumBufSize;
2957
    const int vChrBufSize= c->vChrBufSize;
2958
    uint8_t *funnyYCode= c->funnyYCode;
2959
    uint8_t *funnyUVCode= c->funnyUVCode;
2960
    uint8_t *formatConvBuffer= c->formatConvBuffer;
2961
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2962
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2963
    int lastDstY;
2964
    uint8_t *pal=NULL;
2965

    
2966
    /* vars which will change and which we need to store back in the context */
2967
    int dstY= c->dstY;
2968
    int lumBufIndex= c->lumBufIndex;
2969
    int chrBufIndex= c->chrBufIndex;
2970
    int lastInLumBuf= c->lastInLumBuf;
2971
    int lastInChrBuf= c->lastInChrBuf;
2972

    
2973
    if (isPacked(c->srcFormat)){
2974
        pal= src[1];
2975
        src[0]=
2976
        src[1]=
2977
        src[2]= src[0];
2978
        srcStride[0]=
2979
        srcStride[1]=
2980
        srcStride[2]= srcStride[0];
2981
    }
2982
    srcStride[1]<<= c->vChrDrop;
2983
    srcStride[2]<<= c->vChrDrop;
2984

    
2985
    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
2986
    //       (int)dst[0], (int)dst[1], (int)dst[2]);
2987

    
2988
#if 0 //self test FIXME move to a vfilter or something
2989
    {
2990
    static volatile int i=0;
2991
    i++;
2992
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
2993
        selfTest(src, srcStride, c->srcW, c->srcH);
2994
    i--;
2995
    }
2996
#endif
2997

    
2998
    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
2999
    //dstStride[0],dstStride[1],dstStride[2]);
3000

    
3001
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
3002
    {
3003
        static int firstTime=1; //FIXME move this into the context perhaps
3004
        if (flags & SWS_PRINT_INFO && firstTime)
3005
        {
3006
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
3007
                   "         ->cannot do aligned memory accesses anymore\n");
3008
            firstTime=0;
3009
        }
3010
    }
3011

    
3012
    /* Note the user might start scaling the picture in the middle so this
3013
       will not get executed. This is not really intended but works
3014
       currently, so people might do it. */
3015
    if (srcSliceY ==0){
3016
        lumBufIndex=0;
3017
        chrBufIndex=0;
3018
        dstY=0;
3019
        lastInLumBuf= -1;
3020
        lastInChrBuf= -1;
3021
    }
3022

    
3023
    lastDstY= dstY;
3024

    
3025
    for (;dstY < dstH; dstY++){
3026
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
3027
        const int chrDstY= dstY>>c->chrDstVSubSample;
3028
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
3029
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;
3030

    
3031
        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
3032
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
3033
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
3034
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
3035

    
3036
        //printf("dstY:%d dstH:%d firstLumSrcY:%d lastInLumBuf:%d vLumBufSize: %d vChrBufSize: %d slice: %d %d vLumFilterSize: %d firstChrSrcY: %d vChrFilterSize: %d c->chrSrcVSubSample: %d\n",
3037
        // dstY, dstH, firstLumSrcY, lastInLumBuf, vLumBufSize, vChrBufSize, srcSliceY, srcSliceH, vLumFilterSize, firstChrSrcY, vChrFilterSize,  c->chrSrcVSubSample);
3038
        //handle holes (FAST_BILINEAR & weird filters)
3039
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
3040
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
3041
        //printf("%d %d %d\n", firstChrSrcY, lastInChrBuf, vChrBufSize);
3042
        ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
3043
        ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)
3044

    
3045
        // Do we have enough lines in this slice to output the dstY line
3046
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
3047
        {
3048
            //Do horizontal scaling
3049
            while(lastInLumBuf < lastLumSrcY)
3050
            {
3051
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3052
                lumBufIndex++;
3053
                //printf("%d %d %d %d\n", lumBufIndex, vLumBufSize, lastInLumBuf,  lastLumSrcY);
3054
                ASSERT(lumBufIndex < 2*vLumBufSize)
3055
                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3056
                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3057
                //printf("%d %d\n", lumBufIndex, vLumBufSize);
3058
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3059
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3060
                                funnyYCode, c->srcFormat, formatConvBuffer,
3061
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3062
                lastInLumBuf++;
3063
            }
3064
            while(lastInChrBuf < lastChrSrcY)
3065
            {
3066
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3067
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3068
                chrBufIndex++;
3069
                ASSERT(chrBufIndex < 2*vChrBufSize)
3070
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
3071
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3072
                //FIXME replace parameters through context struct (some at least)
3073

    
3074
                if (!(isGray(srcFormat) || isGray(dstFormat)))
3075
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3076
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3077
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
3078
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3079
                lastInChrBuf++;
3080
            }
3081
            //wrap buf index around to stay inside the ring buffer
3082
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3083
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3084
        }
3085
        else // not enough lines left in this slice -> load the rest in the buffer
3086
        {
3087
            /* printf("%d %d Last:%d %d LastInBuf:%d %d Index:%d %d Y:%d FSize: %d %d BSize: %d %d\n",
3088
            firstChrSrcY,firstLumSrcY,lastChrSrcY,lastLumSrcY,
3089
            lastInChrBuf,lastInLumBuf,chrBufIndex,lumBufIndex,dstY,vChrFilterSize,vLumFilterSize,
3090
            vChrBufSize, vLumBufSize);*/
3091

    
3092
            //Do horizontal scaling
3093
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
3094
            {
3095
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
3096
                lumBufIndex++;
3097
                ASSERT(lumBufIndex < 2*vLumBufSize)
3098
                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
3099
                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
3100
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
3101
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
3102
                                funnyYCode, c->srcFormat, formatConvBuffer,
3103
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
3104
                lastInLumBuf++;
3105
            }
3106
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
3107
            {
3108
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
3109
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
3110
                chrBufIndex++;
3111
                ASSERT(chrBufIndex < 2*vChrBufSize)
3112
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
3113
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
3114

    
3115
                if (!(isGray(srcFormat) || isGray(dstFormat)))
3116
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
3117
                            flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
3118
                            funnyUVCode, c->srcFormat, formatConvBuffer,
3119
                            c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
3120
                lastInChrBuf++;
3121
            }
3122
            //wrap buf index around to stay inside the ring buffer
3123
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
3124
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
3125
            break; //we can't output a dstY line so let's try with the next slice
3126
        }
3127

    
3128
#ifdef HAVE_MMX
3129
        b5Dither= ff_dither8[dstY&1];
3130
        g6Dither= ff_dither4[dstY&1];
3131
        g5Dither= ff_dither8[dstY&1];
3132
        r5Dither= ff_dither8[(dstY+1)&1];
3133
#endif
3134
        if (dstY < dstH-2)
3135
        {
3136
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3137
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3138
#ifdef HAVE_MMX
3139
            int i;
3140
        if (flags & SWS_ACCURATE_RND){
3141
            for (i=0; i<vLumFilterSize; i+=2){
3142
                lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i  ];
3143
                lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
3144
                lumMmxFilter[2*i+2]=
3145
                lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i    ]
3146
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
3147
            }
3148
            for (i=0; i<vChrFilterSize; i+=2){
3149
                chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i  ];
3150
                chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
3151
                chrMmxFilter[2*i+2]=
3152
                chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i    ]
3153
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
3154
            }
3155
        }else{
3156
            for (i=0; i<vLumFilterSize; i++)
3157
            {
3158
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
3159
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
3160
                lumMmxFilter[4*i+2]=
3161
                lumMmxFilter[4*i+3]=
3162
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
3163
            }
3164
            for (i=0; i<vChrFilterSize; i++)
3165
            {
3166
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
3167
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
3168
                chrMmxFilter[4*i+2]=
3169
                chrMmxFilter[4*i+3]=
3170
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
3171
            }
3172
        }
3173
#endif
3174
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3175
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3176
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3177
                RENAME(yuv2nv12X)(c,
3178
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3179
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3180
                    dest, uDest, dstW, chrDstW, dstFormat);
3181
            }
3182
            else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
3183
            {
3184
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3185
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3186
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // unscaled YV12
3187
                {
3188
                    int16_t *lumBuf = lumPixBuf[0];
3189
                    int16_t *chrBuf= chrPixBuf[0];
3190
                    RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
3191
                }
3192
                else //General YV12
3193
                {
3194
                    RENAME(yuv2yuvX)(c,
3195
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3196
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3197
                        dest, uDest, vDest, dstW, chrDstW);
3198
                }
3199
            }
3200
            else
3201
            {
3202
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3203
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3204
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //unscaled RGB
3205
                {
3206
                    int chrAlpha= vChrFilter[2*dstY+1];
3207
                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
3208
                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
3209
                }
3210
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //bilinear upscale RGB
3211
                {
3212
                    int lumAlpha= vLumFilter[2*dstY+1];
3213
                    int chrAlpha= vChrFilter[2*dstY+1];
3214
                    lumMmxFilter[2]=
3215
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
3216
                    chrMmxFilter[2]=
3217
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
3218
                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
3219
                        dest, dstW, lumAlpha, chrAlpha, dstY);
3220
                }
3221
                else //general RGB
3222
                {
3223
                    RENAME(yuv2packedX)(c,
3224
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3225
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3226
                        dest, dstW, dstY);
3227
                }
3228
            }
3229
        }
3230
        else // hmm looks like we can't use MMX here without overwriting this array's tail
3231
        {
3232
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
3233
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
3234
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
3235
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3236
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
3237
                yuv2nv12XinC(
3238
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3239
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3240
                    dest, uDest, dstW, chrDstW, dstFormat);
3241
            }
3242
            else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
3243
            {
3244
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
3245
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
3246
                yuv2yuvXinC(
3247
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
3248
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3249
                    dest, uDest, vDest, dstW, chrDstW);
3250
            }
3251
            else
3252
            {
3253
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
3254
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
3255
                yuv2packedXinC(c,
3256
                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
3257
                    vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
3258
                    dest, dstW, dstY);
3259
            }
3260
        }
3261
    }
3262

    
3263
#ifdef HAVE_MMX
3264
    asm volatile(SFENCE:::"memory");
3265
    asm volatile(EMMS:::"memory");
3266
#endif
3267
    /* store changed local vars back in the context */
3268
    c->dstY= dstY;
3269 </