/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * the C code (not assembly, mmx, ...) of this file can be used
 * under the LGPL license too
 */
23

    
24
#undef REAL_MOVNTQ
25
#undef MOVNTQ
26
#undef PAVGB
27
#undef PREFETCH
28
#undef PREFETCHW
29
#undef EMMS
30
#undef SFENCE
31

    
32
#ifdef HAVE_3DNOW
33
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
34
#define EMMS     "femms"
35
#else
36
#define EMMS     "emms"
37
#endif
38

    
39
#ifdef HAVE_3DNOW
40
#define PREFETCH  "prefetch"
41
#define PREFETCHW "prefetchw"
42
#elif defined ( HAVE_MMX2 )
43
#define PREFETCH "prefetchnta"
44
#define PREFETCHW "prefetcht0"
45
#else
46
#define PREFETCH  " # nop"
47
#define PREFETCHW " # nop"
48
#endif
49

    
50
#ifdef HAVE_MMX2
51
#define SFENCE "sfence"
52
#else
53
#define SFENCE " # nop"
54
#endif
55

    
56
#ifdef HAVE_MMX2
57
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
58
#elif defined (HAVE_3DNOW)
59
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
60
#endif
61

    
62
#ifdef HAVE_MMX2
63
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
64
#else
65
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
66
#endif
67
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
68

    
69
#ifdef HAVE_ALTIVEC
70
#include "swscale_altivec_template.c"
71
#endif
72

    
73
#define YSCALEYUV2YV12X(x, offset, dest, width) \
74
    asm volatile(\
75
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
76
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
77
    "movq                             %%mm3, %%mm4      \n\t"\
78
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
79
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
80
    ASMALIGN(4) /* FIXME Unroll? */\
81
    "1:                                                 \n\t"\
82
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
83
    "movq   " #x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
84
    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
85
    "add                                $16, %%"REG_d"  \n\t"\
86
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
87
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
88
    "pmulhw                           %%mm0, %%mm2      \n\t"\
89
    "pmulhw                           %%mm0, %%mm5      \n\t"\
90
    "paddw                            %%mm2, %%mm3      \n\t"\
91
    "paddw                            %%mm5, %%mm4      \n\t"\
92
    " jnz                                1b             \n\t"\
93
    "psraw                               $3, %%mm3      \n\t"\
94
    "psraw                               $3, %%mm4      \n\t"\
95
    "packuswb                         %%mm4, %%mm3      \n\t"\
96
    MOVNTQ(%%mm3, (%1, %%REGa))\
97
    "add                                 $8, %%"REG_a"  \n\t"\
98
    "cmp                                 %2, %%"REG_a"  \n\t"\
99
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
100
    "movq                             %%mm3, %%mm4      \n\t"\
101
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
102
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
103
    "jb                                  1b             \n\t"\
104
    :: "r" (&c->redDither),\
105
    "r" (dest), "g" (width)\
106
    : "%"REG_a, "%"REG_d, "%"REG_S\
107
    );
108

    
109
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
110
    asm volatile(\
111
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
112
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
113
    "pxor                             %%mm4, %%mm4      \n\t"\
114
    "pxor                             %%mm5, %%mm5      \n\t"\
115
    "pxor                             %%mm6, %%mm6      \n\t"\
116
    "pxor                             %%mm7, %%mm7      \n\t"\
117
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
118
    ASMALIGN(4) \
119
    "1:                                                 \n\t"\
120
    "movq   " #x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
121
    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
122
    "mov                       4(%%"REG_d"), %%"REG_S"  \n\t"\
123
    "movq   " #x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
124
    "movq                             %%mm0, %%mm3      \n\t"\
125
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
126
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
127
    "movq                      8(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
128
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
129
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
130
    "paddd                            %%mm0, %%mm4      \n\t"\
131
    "paddd                            %%mm3, %%mm5      \n\t"\
132
    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
133
    "mov                      16(%%"REG_d"), %%"REG_S"  \n\t"\
134
    "add                                $16, %%"REG_d"  \n\t"\
135
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
136
    "movq                             %%mm2, %%mm0      \n\t"\
137
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
138
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
139
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
140
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
141
    "paddd                            %%mm2, %%mm6      \n\t"\
142
    "paddd                            %%mm0, %%mm7      \n\t"\
143
    " jnz                                1b             \n\t"\
144
    "psrad                              $16, %%mm4      \n\t"\
145
    "psrad                              $16, %%mm5      \n\t"\
146
    "psrad                              $16, %%mm6      \n\t"\
147
    "psrad                              $16, %%mm7      \n\t"\
148
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
149
    "packssdw                         %%mm5, %%mm4      \n\t"\
150
    "packssdw                         %%mm7, %%mm6      \n\t"\
151
    "paddw                            %%mm0, %%mm4      \n\t"\
152
    "paddw                            %%mm0, %%mm6      \n\t"\
153
    "psraw                               $3, %%mm4      \n\t"\
154
    "psraw                               $3, %%mm6      \n\t"\
155
    "packuswb                         %%mm6, %%mm4      \n\t"\
156
    MOVNTQ(%%mm4, (%1, %%REGa))\
157
    "add                                 $8, %%"REG_a"  \n\t"\
158
    "cmp                                 %2, %%"REG_a"  \n\t"\
159
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
160
    "pxor                             %%mm4, %%mm4      \n\t"\
161
    "pxor                             %%mm5, %%mm5      \n\t"\
162
    "pxor                             %%mm6, %%mm6      \n\t"\
163
    "pxor                             %%mm7, %%mm7      \n\t"\
164
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
165
    "jb                                  1b             \n\t"\
166
    :: "r" (&c->redDither),\
167
    "r" (dest), "g" (width)\
168
    : "%"REG_a, "%"REG_d, "%"REG_S\
169
    );
170

    
171
#define YSCALEYUV2YV121 \
172
    "mov %2, %%"REG_a"                    \n\t"\
173
    ASMALIGN(4) /* FIXME Unroll? */\
174
    "1:                                   \n\t"\
175
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
176
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
177
    "psraw                 $7, %%mm0      \n\t"\
178
    "psraw                 $7, %%mm1      \n\t"\
179
    "packuswb           %%mm1, %%mm0      \n\t"\
180
    MOVNTQ(%%mm0, (%1, %%REGa))\
181
    "add                   $8, %%"REG_a"  \n\t"\
182
    "jnc                   1b             \n\t"
183

    
184
/*
185
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
186
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
187
       "r" (dest), "m" (dstW),
188
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
189
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
190
*/
191
#define YSCALEYUV2PACKEDX \
192
    asm volatile(\
193
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
194
    ASMALIGN(4)\
195
    "nop                                            \n\t"\
196
    "1:                                             \n\t"\
197
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
198
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
199
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
200
    "movq                      %%mm3, %%mm4         \n\t"\
201
    ASMALIGN(4)\
202
    "2:                                             \n\t"\
203
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
204
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
205
    "movq 4096(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
206
    "add                         $16, %%"REG_d"     \n\t"\
207
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
208
    "pmulhw                    %%mm0, %%mm2         \n\t"\
209
    "pmulhw                    %%mm0, %%mm5         \n\t"\
210
    "paddw                     %%mm2, %%mm3         \n\t"\
211
    "paddw                     %%mm5, %%mm4         \n\t"\
212
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
213
    " jnz                         2b                \n\t"\
214
\
215
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
216
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
217
    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
218
    "movq                      %%mm1, %%mm7         \n\t"\
219
    ASMALIGN(4)\
220
    "2:                                             \n\t"\
221
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
222
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
223
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
224
    "add                         $16, %%"REG_d"            \n\t"\
225
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
226
    "pmulhw                    %%mm0, %%mm2         \n\t"\
227
    "pmulhw                    %%mm0, %%mm5         \n\t"\
228
    "paddw                     %%mm2, %%mm1         \n\t"\
229
    "paddw                     %%mm5, %%mm7         \n\t"\
230
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
231
    " jnz                         2b                \n\t"\
232

    
233
#define YSCALEYUV2PACKEDX_END                 \
234
    :: "r" (&c->redDither),                   \
235
        "m" (dummy), "m" (dummy), "m" (dummy),\
236
        "r" (dest), "m" (dstW)                \
237
    : "%"REG_a, "%"REG_d, "%"REG_S            \
238
    );
239

    
240
#define YSCALEYUV2PACKEDX_ACCURATE \
241
    asm volatile(\
242
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
243
    ASMALIGN(4)\
244
    "nop                                            \n\t"\
245
    "1:                                             \n\t"\
246
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
247
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
248
    "pxor                      %%mm4, %%mm4         \n\t"\
249
    "pxor                      %%mm5, %%mm5         \n\t"\
250
    "pxor                      %%mm6, %%mm6         \n\t"\
251
    "pxor                      %%mm7, %%mm7         \n\t"\
252
    ASMALIGN(4)\
253
    "2:                                             \n\t"\
254
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
255
    "movq 4096(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
256
    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
257
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
258
    "movq                      %%mm0, %%mm3         \n\t"\
259
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
260
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
261
    "movq               8(%%"REG_d"), %%mm1         \n\t" /* filterCoeff */\
262
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
263
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
264
    "paddd                     %%mm0, %%mm4         \n\t"\
265
    "paddd                     %%mm3, %%mm5         \n\t"\
266
    "movq 4096(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
267
    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
268
    "add                         $16, %%"REG_d"     \n\t"\
269
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
270
    "movq                      %%mm2, %%mm0         \n\t"\
271
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
272
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
273
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
274
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
275
    "paddd                     %%mm2, %%mm6         \n\t"\
276
    "paddd                     %%mm0, %%mm7         \n\t"\
277
    " jnz                         2b                \n\t"\
278
    "psrad                       $16, %%mm4         \n\t"\
279
    "psrad                       $16, %%mm5         \n\t"\
280
    "psrad                       $16, %%mm6         \n\t"\
281
    "psrad                       $16, %%mm7         \n\t"\
282
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
283
    "packssdw                  %%mm5, %%mm4         \n\t"\
284
    "packssdw                  %%mm7, %%mm6         \n\t"\
285
    "paddw                     %%mm0, %%mm4         \n\t"\
286
    "paddw                     %%mm0, %%mm6         \n\t"\
287
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
288
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
289
\
290
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
291
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
292
    "pxor                      %%mm1, %%mm1         \n\t"\
293
    "pxor                      %%mm5, %%mm5         \n\t"\
294
    "pxor                      %%mm7, %%mm7         \n\t"\
295
    "pxor                      %%mm6, %%mm6         \n\t"\
296
    ASMALIGN(4)\
297
    "2:                                             \n\t"\
298
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
299
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
300
    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
301
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
302
    "movq                      %%mm0, %%mm3         \n\t"\
303
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
304
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
305
    "movq               8(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
306
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
307
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
308
    "paddd                     %%mm0, %%mm1         \n\t"\
309
    "paddd                     %%mm3, %%mm5         \n\t"\
310
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
311
    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
312
    "add                         $16, %%"REG_d"     \n\t"\
313
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
314
    "movq                      %%mm2, %%mm0         \n\t"\
315
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
316
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
317
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
318
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
319
    "paddd                     %%mm2, %%mm7         \n\t"\
320
    "paddd                     %%mm0, %%mm6         \n\t"\
321
    " jnz                         2b                \n\t"\
322
    "psrad                       $16, %%mm1         \n\t"\
323
    "psrad                       $16, %%mm5         \n\t"\
324
    "psrad                       $16, %%mm7         \n\t"\
325
    "psrad                       $16, %%mm6         \n\t"\
326
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
327
    "packssdw                  %%mm5, %%mm1         \n\t"\
328
    "packssdw                  %%mm6, %%mm7         \n\t"\
329
    "paddw                     %%mm0, %%mm1         \n\t"\
330
    "paddw                     %%mm0, %%mm7         \n\t"\
331
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
332
    "movq               "V_TEMP"(%0), %%mm4         \n\t"\
333

    
334
#define YSCALEYUV2RGBX \
335
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
336
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
337
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
338
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
339
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
340
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
341
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
342
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
343
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
344
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
345
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
346
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
347
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
348
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
349
    "paddw           %%mm3, %%mm4       \n\t"\
350
    "movq            %%mm2, %%mm0       \n\t"\
351
    "movq            %%mm5, %%mm6       \n\t"\
352
    "movq            %%mm4, %%mm3       \n\t"\
353
    "punpcklwd       %%mm2, %%mm2       \n\t"\
354
    "punpcklwd       %%mm5, %%mm5       \n\t"\
355
    "punpcklwd       %%mm4, %%mm4       \n\t"\
356
    "paddw           %%mm1, %%mm2       \n\t"\
357
    "paddw           %%mm1, %%mm5       \n\t"\
358
    "paddw           %%mm1, %%mm4       \n\t"\
359
    "punpckhwd       %%mm0, %%mm0       \n\t"\
360
    "punpckhwd       %%mm6, %%mm6       \n\t"\
361
    "punpckhwd       %%mm3, %%mm3       \n\t"\
362
    "paddw           %%mm7, %%mm0       \n\t"\
363
    "paddw           %%mm7, %%mm6       \n\t"\
364
    "paddw           %%mm7, %%mm3       \n\t"\
365
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
366
    "packuswb        %%mm0, %%mm2       \n\t"\
367
    "packuswb        %%mm6, %%mm5       \n\t"\
368
    "packuswb        %%mm3, %%mm4       \n\t"\
369
    "pxor            %%mm7, %%mm7       \n\t"
370
#if 0
371
#define FULL_YSCALEYUV2RGB \
372
    "pxor                 %%mm7, %%mm7  \n\t"\
373
    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
374
    "punpcklwd            %%mm6, %%mm6  \n\t"\
375
    "punpcklwd            %%mm6, %%mm6  \n\t"\
376
    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
377
    "punpcklwd            %%mm5, %%mm5  \n\t"\
378
    "punpcklwd            %%mm5, %%mm5  \n\t"\
379
    "xor              %%"REG_a", %%"REG_a"  \n\t"\
380
    ASMALIGN(4)\
381
    "1:                                 \n\t"\
382
    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
383
    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
384
    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
385
    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
386
    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
387
    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
388
    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
389
    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
390
    "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
391
    "movq 4096(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
392
    "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
393
    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
394
    "movq 4096(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
395
    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
396
    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
397
    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
398
    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
399
    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
400
\
401
\
402
    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
403
    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
404
    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
405
    "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
406
    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
407
    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
408
    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
409
\
410
\
411
    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
412
    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
413
    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
414
    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
415
    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
416
    "packuswb             %%mm3, %%mm3  \n\t"\
417
\
418
    "packuswb             %%mm0, %%mm0  \n\t"\
419
    "paddw                %%mm4, %%mm2  \n\t"\
420
    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
421
\
422
    "packuswb             %%mm1, %%mm1  \n\t"
423
#endif
424

    
425
#define REAL_YSCALEYUV2PACKED(index, c) \
426
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
427
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
428
    "psraw                $3, %%mm0                           \n\t"\
429
    "psraw                $3, %%mm1                           \n\t"\
430
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
431
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
432
    "xor            "#index", "#index"                        \n\t"\
433
    ASMALIGN(4)\
434
    "1:                                 \n\t"\
435
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
436
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
437
    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
438
    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
439
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
440
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
441
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
442
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
443
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
444
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
445
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
446
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
447
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
448
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
449
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
450
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
451
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
452
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
453
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
454
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
455
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
456
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
457
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
458
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
459
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
460

    
461
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
462

    
463
#define REAL_YSCALEYUV2RGB(index, c) \
464
    "xor            "#index", "#index"  \n\t"\
465
    ASMALIGN(4)\
466
    "1:                                 \n\t"\
467
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
468
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
469
    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
470
    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
471
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
472
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
473
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
474
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
475
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
476
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
477
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
478
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
479
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
480
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
481
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
482
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
483
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
484
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
485
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
486
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
487
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
488
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
489
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
490
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
491
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
492
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
493
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
494
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
495
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
496
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
497
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
498
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
499
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
500
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
501
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
502
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
503
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
504
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
505
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
506
    "paddw             %%mm3, %%mm4     \n\t"\
507
    "movq              %%mm2, %%mm0     \n\t"\
508
    "movq              %%mm5, %%mm6     \n\t"\
509
    "movq              %%mm4, %%mm3     \n\t"\
510
    "punpcklwd         %%mm2, %%mm2     \n\t"\
511
    "punpcklwd         %%mm5, %%mm5     \n\t"\
512
    "punpcklwd         %%mm4, %%mm4     \n\t"\
513
    "paddw             %%mm1, %%mm2     \n\t"\
514
    "paddw             %%mm1, %%mm5     \n\t"\
515
    "paddw             %%mm1, %%mm4     \n\t"\
516
    "punpckhwd         %%mm0, %%mm0     \n\t"\
517
    "punpckhwd         %%mm6, %%mm6     \n\t"\
518
    "punpckhwd         %%mm3, %%mm3     \n\t"\
519
    "paddw             %%mm7, %%mm0     \n\t"\
520
    "paddw             %%mm7, %%mm6     \n\t"\
521
    "paddw             %%mm7, %%mm3     \n\t"\
522
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
523
    "packuswb          %%mm0, %%mm2     \n\t"\
524
    "packuswb          %%mm6, %%mm5     \n\t"\
525
    "packuswb          %%mm3, %%mm4     \n\t"\
526
    "pxor              %%mm7, %%mm7     \n\t"
527
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
528

    
529
#define REAL_YSCALEYUV2PACKED1(index, c) \
530
    "xor            "#index", "#index"  \n\t"\
531
    ASMALIGN(4)\
532
    "1:                                 \n\t"\
533
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
534
    "movq 4096(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
535
    "psraw                $7, %%mm3     \n\t" \
536
    "psraw                $7, %%mm4     \n\t" \
537
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
538
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
539
    "psraw                $7, %%mm1     \n\t" \
540
    "psraw                $7, %%mm7     \n\t" \
541

    
542
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
543

    
544
#define REAL_YSCALEYUV2RGB1(index, c) \
545
    "xor            "#index", "#index"  \n\t"\
546
    ASMALIGN(4)\
547
    "1:                                 \n\t"\
548
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
549
    "movq 4096(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
550
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
551
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
552
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
553
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
554
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
555
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
556
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
557
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
558
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
559
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
560
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
561
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
562
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
563
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
564
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
565
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
566
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
567
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
568
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
569
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
570
    "paddw             %%mm3, %%mm4     \n\t"\
571
    "movq              %%mm2, %%mm0     \n\t"\
572
    "movq              %%mm5, %%mm6     \n\t"\
573
    "movq              %%mm4, %%mm3     \n\t"\
574
    "punpcklwd         %%mm2, %%mm2     \n\t"\
575
    "punpcklwd         %%mm5, %%mm5     \n\t"\
576
    "punpcklwd         %%mm4, %%mm4     \n\t"\
577
    "paddw             %%mm1, %%mm2     \n\t"\
578
    "paddw             %%mm1, %%mm5     \n\t"\
579
    "paddw             %%mm1, %%mm4     \n\t"\
580
    "punpckhwd         %%mm0, %%mm0     \n\t"\
581
    "punpckhwd         %%mm6, %%mm6     \n\t"\
582
    "punpckhwd         %%mm3, %%mm3     \n\t"\
583
    "paddw             %%mm7, %%mm0     \n\t"\
584
    "paddw             %%mm7, %%mm6     \n\t"\
585
    "paddw             %%mm7, %%mm3     \n\t"\
586
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
587
    "packuswb          %%mm0, %%mm2     \n\t"\
588
    "packuswb          %%mm6, %%mm5     \n\t"\
589
    "packuswb          %%mm3, %%mm4     \n\t"\
590
    "pxor              %%mm7, %%mm7     \n\t"
591
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
592

    
593
/* Average the two chroma input lines (unweighted, >>8 after paddw) and
 * scale one luma line (>>7) into mm1/mm7 for packed (non-RGB) output.
 * %0=buf0, %2=uvbuf0, %3=uvbuf1; chroma V plane lives at +4096 bytes. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
610

    
611
// do vertical chrominance interpolation
/* Single-luma-line YUV->RGB: averages the two chroma lines, then applies
 * the per-context color-conversion coefficients (stored at offsets into
 * "#c", the SwsContext).  Leaves packed B in mm2, G in mm4, R in mm5 and
 * zero in mm7, ready for one of the WRITEBGR* store macros. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
664

    
665
/* Interleave the packed B/G/R bytes produced by YSCALEYUV2RGB* into
 * 32-bit 0RGB pixels and store 8 pixels per iteration; loops back to
 * label "1:" until index reaches dstw. */
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0,   (dst, index, 4))\
    MOVNTQ(%%mm2,  8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
689

    
690
/* Pack B/G/R bytes into RGB565 (5-6-5) and store 8 pixels per loop
 * iteration.  Inputs masked to their top bits (bF8/bFC) before shifting
 * into the 16-bit layout. */
#define REAL_WRITEBGR16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
717

    
718
/* Pack B/G/R bytes into RGB555 (1-5-5-5) and store 8 pixels per loop
 * iteration.  Same structure as WRITEBGR16 but with 5 bits of green
 * (all channels masked with bF8, R shifted right one extra bit). */
#define REAL_WRITEBGR15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
746

    
747
/* Legacy 24-bit BGR store: expands to 0RGB dwords first, then shifts and
 * masks (bm000*) the pieces together into three packed 8-byte stores
 * (24 bytes = 8 pixels).  Kept for reference; superseded by
 * WRITEBGR24MMX/WRITEBGR24MMX2 below. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
802

    
803
/* Plain-MMX 24-bit BGR store: builds 0RGBRGB0 quadwords via punpckhdq,
 * then shift/or-combines them into three contiguous 8-byte stores
 * (24 bytes = 8 pixels) and advances dst by 24. */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"

    
856
/* MMX2 24-bit BGR store: uses pshufw to replicate channel bytes and the
 * M24A/M24B/M24C byte-select masks to assemble three 8-byte groups
 * (24 bytes = 8 pixels) without the long shift/or chains of the plain
 * MMX version. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(M24A)", %%mm0 \n\t"\
    "movq "MANGLE(M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
903

    
904
/* Select the 24-bit BGR writer implementation for this build:
 * the pshufw-based variant when MMX2 is available, plain MMX otherwise. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
911

    
912
/* Pack luma (mm1/mm7) and chroma (mm3=U, mm4=V) into YUYV order and
 * store 8 output pixels (16 bytes) per loop iteration. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
928

    
929

    
930
/**
 * Vertical multi-tap scaling to planar YV12.
 * Applies the luma filter to lumSrc and (when uDest is non-NULL) the
 * chroma filter to chrSrc, writing 8-bit planes to dest/uDest/vDest.
 * Dispatches to MMX asm (accurate-rounding variant when
 * SWS_ACCURATE_RND is set), AltiVec, or the C fallback.
 * NOTE(review): the V plane is read from chrSrc at a +4096 int16 offset,
 * matching the 2048-pixel U/V split used throughout this file.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (c->flags & SWS_ACCURATE_RND){
        if (uDest){
            YSCALEYUV2YV12X_ACCURATE(   0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
    }else{
        if (uDest){
            YSCALEYUV2YV12X(   0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
    }
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif
}
962

    
963
/**
 * Vertical multi-tap scaling to NV12/NV21 (semi-planar).
 * No SIMD variant exists for this path; always forwards to the C
 * implementation yuv2nv12XinC.
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize,
             chrFilter, chrSrc, chrFilterSize,
             dest, uDest, dstW, chrDstW, dstFormat);
}
971

    
972
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
973
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
974
{
975
#ifdef HAVE_MMX
976
    if (uDest != NULL)
977
    {
978
        asm volatile(
979
            YSCALEYUV2YV121
980
            :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
981
            "g" (-chrDstW)
982
            : "%"REG_a
983
        );
984

    
985
        asm volatile(
986
            YSCALEYUV2YV121
987
            :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
988
            "g" (-chrDstW)
989
            : "%"REG_a
990
        );
991
    }
992

    
993
    asm volatile(
994
        YSCALEYUV2YV121
995
        :: "r" (lumSrc + dstW), "r" (dest + dstW),
996
        "g" (-dstW)
997
        : "%"REG_a
998
    );
999
#else
1000
    int i;
1001
    for (i=0; i<dstW; i++)
1002
    {
1003
        int val= lumSrc[i]>>7;
1004

    
1005
        if (val&256){
1006
            if (val<0) val=0;
1007
            else       val=255;
1008
        }
1009

    
1010
        dest[i]= val;
1011
    }
1012

    
1013
    if (uDest != NULL)
1014
        for (i=0; i<chrDstW; i++)
1015
        {
1016
            int u=chrSrc[i]>>7;
1017
            int v=chrSrc[i + 2048]>>7;
1018

    
1019
            if ((u|v)&256){
1020
                if (u<0)        u=0;
1021
                else if (u>255) u=255;
1022
                if (v<0)        v=0;
1023
                else if (v>255) v=255;
1024
            }
1025

    
1026
            uDest[i]= u;
1027
            vDest[i]= v;
1028
        }
1029
#endif
1030
}
1031

    
1032

    
1033
/**
1034
 * vertical scale YV12 to RGB
1035
 */
1036
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
1037
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
1038
                                       uint8_t *dest, long dstW, long dstY)
1039
{
1040
#ifdef HAVE_MMX
1041
    long dummy=0;
1042
    if (c->flags & SWS_ACCURATE_RND){
1043
        switch(c->dstFormat){
1044
        case PIX_FMT_RGB32:
1045
            YSCALEYUV2PACKEDX_ACCURATE
1046
            YSCALEYUV2RGBX
1047
            WRITEBGR32(%4, %5, %%REGa)
1048

    
1049
            YSCALEYUV2PACKEDX_END
1050
            return;
1051
        case PIX_FMT_BGR24:
1052
            YSCALEYUV2PACKEDX_ACCURATE
1053
            YSCALEYUV2RGBX
1054
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
1055
            "add %4, %%"REG_c"                        \n\t"
1056
            WRITEBGR24(%%REGc, %5, %%REGa)
1057

    
1058

    
1059
            :: "r" (&c->redDither),
1060
               "m" (dummy), "m" (dummy), "m" (dummy),
1061
               "r" (dest), "m" (dstW)
1062
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1063
            );
1064
            return;
1065
        case PIX_FMT_BGR555:
1066
            YSCALEYUV2PACKEDX_ACCURATE
1067
            YSCALEYUV2RGBX
1068
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1069
#ifdef DITHER1XBPP
1070
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1071
            "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
1072
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1073
#endif
1074

    
1075
            WRITEBGR15(%4, %5, %%REGa)
1076
            YSCALEYUV2PACKEDX_END
1077
            return;
1078
        case PIX_FMT_BGR565:
1079
            YSCALEYUV2PACKEDX_ACCURATE
1080
            YSCALEYUV2RGBX
1081
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1082
#ifdef DITHER1XBPP
1083
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
1084
            "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
1085
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
1086
#endif
1087

    
1088
            WRITEBGR16(%4, %5, %%REGa)
1089
            YSCALEYUV2PACKEDX_END
1090
            return;
1091
        case PIX_FMT_YUYV422:
1092
            YSCALEYUV2PACKEDX_ACCURATE
1093
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1094

    
1095
            "psraw $3, %%mm3    \n\t"
1096
            "psraw $3, %%mm4    \n\t"
1097
            "psraw $3, %%mm1    \n\t"
1098
            "psraw $3, %%mm7    \n\t"
1099
            WRITEYUY2(%4, %5, %%REGa)
1100
            YSCALEYUV2PACKEDX_END
1101
            return;
1102
    }
1103
    }else{
1104
        switch(c->dstFormat)
1105
        {
1106
        case PIX_FMT_RGB32:
1107
            YSCALEYUV2PACKEDX
1108
            YSCALEYUV2RGBX
1109
            WRITEBGR32(%4, %5, %%REGa)
1110
            YSCALEYUV2PACKEDX_END
1111
            return;
1112
        case PIX_FMT_BGR24:
1113
            YSCALEYUV2PACKEDX
1114
            YSCALEYUV2RGBX
1115
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
1116
            "add                        %4, %%"REG_c"   \n\t"
1117
            WRITEBGR24(%%REGc, %5, %%REGa)
1118

    
1119
            :: "r" (&c->redDither),
1120
               "m" (dummy), "m" (dummy), "m" (dummy),
1121
               "r" (dest),  "m" (dstW)
1122
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
1123
            );
1124
            return;
1125
        case PIX_FMT_BGR555:
1126
            YSCALEYUV2PACKEDX
1127
            YSCALEYUV2RGBX
1128
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1129
#ifdef DITHER1XBPP
1130
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
1131
            "paddusb "MANGLE(g5Dither)", %%mm4  \n\t"
1132
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
1133
#endif
1134

    
1135
            WRITEBGR15(%4, %5, %%REGa)
1136
            YSCALEYUV2PACKEDX_END
1137
            return;
1138
        case PIX_FMT_BGR565:
1139
            YSCALEYUV2PACKEDX
1140
            YSCALEYUV2RGBX
1141
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1142
#ifdef DITHER1XBPP
1143
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
1144
            "paddusb "MANGLE(g6Dither)", %%mm4  \n\t"
1145
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
1146
#endif
1147

    
1148
            WRITEBGR16(%4, %5, %%REGa)
1149
            YSCALEYUV2PACKEDX_END
1150
            return;
1151
        case PIX_FMT_YUYV422:
1152
            YSCALEYUV2PACKEDX
1153
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1154

    
1155
            "psraw $3, %%mm3    \n\t"
1156
            "psraw $3, %%mm4    \n\t"
1157
            "psraw $3, %%mm1    \n\t"
1158
            "psraw $3, %%mm7    \n\t"
1159
            WRITEYUY2(%4, %5, %%REGa)
1160
            YSCALEYUV2PACKEDX_END
1161
            return;
1162
        }
1163
    }
1164
#endif
1165
#ifdef HAVE_ALTIVEC
1166
    /* The following list of supported dstFormat values should
1167
       match what's found in the body of altivec_yuv2packedX() */
1168
    if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
1169
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
1170
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
1171
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
1172
                                 chrFilter, chrSrc, chrFilterSize,
1173
                                 dest, dstW, dstY);
1174
    else
1175
#endif
1176
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
1177
                       chrFilter, chrSrc, chrFilterSize,
1178
                       dest, dstW, dstY);
1179
}
1180

    
1181
/**
1182
 * vertical bilinear scale YV12 to RGB
1183
 */
1184
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1185
                          uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1186
{
1187
    int yalpha1=yalpha^4095;
1188
    int uvalpha1=uvalpha^4095;
1189
    int i;
1190

    
1191
#if 0 //isn't used
1192
    if (flags&SWS_FULL_CHR_H_INT)
1193
    {
1194
        switch(dstFormat)
1195
        {
1196
#ifdef HAVE_MMX
1197
        case PIX_FMT_RGB32:
1198
            asm volatile(
1199

1200

1201
FULL_YSCALEYUV2RGB
1202
            "punpcklbw %%mm1, %%mm3    \n\t" // BGBGBGBG
1203
            "punpcklbw %%mm7, %%mm0    \n\t" // R0R0R0R0
1204

1205
            "movq      %%mm3, %%mm1    \n\t"
1206
            "punpcklwd %%mm0, %%mm3    \n\t" // BGR0BGR0
1207
            "punpckhwd %%mm0, %%mm1    \n\t" // BGR0BGR0
1208

1209
            MOVNTQ(%%mm3,  (%4, %%REGa, 4))
1210
            MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1211

1212
            "add $4, %%"REG_a"  \n\t"
1213
            "cmp %5, %%"REG_a"  \n\t"
1214
            " jb 1b             \n\t"
1215

1216
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1217
            "m" (yalpha1), "m" (uvalpha1)
1218
            : "%"REG_a
1219
            );
1220
            break;
1221
        case PIX_FMT_BGR24:
1222
            asm volatile(
1223

1224
FULL_YSCALEYUV2RGB
1225

1226
                                              // lsb ... msb
1227
            "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
1228
            "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0
1229

1230
            "movq      %%mm3, %%mm1     \n\t"
1231
            "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
1232
            "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0
1233

1234
            "movq      %%mm3, %%mm2     \n\t" // BGR0BGR0
1235
            "psrlq        $8, %%mm3     \n\t" // GR0BGR00
1236
            "pand "MANGLE(bm00000111)", %%mm2   \n\t" // BGR00000
1237
            "pand "MANGLE(bm11111000)", %%mm3   \n\t" // 000BGR00
1238
            "por       %%mm2, %%mm3     \n\t" // BGRBGR00
1239
            "movq      %%mm1, %%mm2     \n\t"
1240
            "psllq       $48, %%mm1     \n\t" // 000000BG
1241
            "por       %%mm1, %%mm3     \n\t" // BGRBGRBG
1242

1243
            "movq      %%mm2, %%mm1     \n\t" // BGR0BGR0
1244
            "psrld       $16, %%mm2     \n\t" // R000R000
1245
            "psrlq       $24, %%mm1     \n\t" // 0BGR0000
1246
            "por       %%mm2, %%mm1     \n\t" // RBGRR000
1247

1248
            "mov          %4, %%"REG_b" \n\t"
1249
            "add   %%"REG_a", %%"REG_b" \n\t"
1250

1251
#ifdef HAVE_MMX2
1252
            //FIXME Alignment
1253
            "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2)   \n\t"
1254
            "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)   \n\t"
1255
#else
1256
            "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)     \n\t"
1257
            "psrlq  $32, %%mm3                          \n\t"
1258
            "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)     \n\t"
1259
            "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)     \n\t"
1260
#endif
1261
            "add     $4, %%"REG_a"                      \n\t"
1262
            "cmp     %5, %%"REG_a"                      \n\t"
1263
            " jb     1b                                 \n\t"
1264

    
1265
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1266
            "m" (yalpha1), "m" (uvalpha1)
1267
            : "%"REG_a, "%"REG_b
1268
            );
1269
            break;
1270
        case PIX_FMT_BGR555:
1271
            asm volatile(
1272

    
1273
FULL_YSCALEYUV2RGB
1274
#ifdef DITHER1XBPP
1275
            "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1276
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1277
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1278
#endif
1279
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1280
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1281
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1282

    
1283
            "psrlw                   $3, %%mm3  \n\t"
1284
            "psllw                   $2, %%mm1  \n\t"
1285
            "psllw                   $7, %%mm0  \n\t"
1286
            "pand     "MANGLE(g15Mask)", %%mm1  \n\t"
1287
            "pand     "MANGLE(r15Mask)", %%mm0  \n\t"
1288

    
1289
            "por                  %%mm3, %%mm1  \n\t"
1290
            "por                  %%mm1, %%mm0  \n\t"
1291

    
1292
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1293

    
1294
            "add $4, %%"REG_a"  \n\t"
1295
            "cmp %5, %%"REG_a"  \n\t"
1296
            " jb 1b             \n\t"
1297

    
1298
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1299
            "m" (yalpha1), "m" (uvalpha1)
1300
            : "%"REG_a
1301
            );
1302
            break;
1303
        case PIX_FMT_BGR565:
1304
            asm volatile(
1305

    
1306
FULL_YSCALEYUV2RGB
1307
#ifdef DITHER1XBPP
1308
            "paddusb "MANGLE(g6Dither)", %%mm1  \n\t"
1309
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1310
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1311
#endif
1312
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1313
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1314
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1315

    
1316
            "psrlw                   $3, %%mm3  \n\t"
1317
            "psllw                   $3, %%mm1  \n\t"
1318
            "psllw                   $8, %%mm0  \n\t"
1319
            "pand     "MANGLE(g16Mask)", %%mm1  \n\t"
1320
            "pand     "MANGLE(r16Mask)", %%mm0  \n\t"
1321

    
1322
            "por                  %%mm3, %%mm1  \n\t"
1323
            "por                  %%mm1, %%mm0  \n\t"
1324

    
1325
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1326

    
1327
            "add $4, %%"REG_a"  \n\t"
1328
            "cmp %5, %%"REG_a"  \n\t"
1329
            " jb 1b             \n\t"
1330

    
1331
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1332
            "m" (yalpha1), "m" (uvalpha1)
1333
            : "%"REG_a
1334
            );
1335
            break;
1336
#endif
1337
        case PIX_FMT_BGR32:
1338
#ifndef HAVE_MMX
1339
        case PIX_FMT_RGB32:
1340
#endif
1341
            if (dstFormat==PIX_FMT_RGB32)
1342
            {
1343
                int i;
1344
#ifdef WORDS_BIGENDIAN
1345
                dest++;
1346
#endif
1347
                for (i=0;i<dstW;i++){
1348
                    // vertical linear interpolation && yuv2rgb in a single step:
1349
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1350
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1351
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1352
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1353
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1354
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1355
                    dest+= 4;
1356
                }
1357
            }
1358
            else if (dstFormat==PIX_FMT_BGR24)
1359
            {
1360
                int i;
1361
                for (i=0;i<dstW;i++){
1362
                    // vertical linear interpolation && yuv2rgb in a single step:
1363
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1364
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1365
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1366
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1367
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1368
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1369
                    dest+= 3;
1370
                }
1371
            }
1372
            else if (dstFormat==PIX_FMT_BGR565)
1373
            {
1374
                int i;
1375
                for (i=0;i<dstW;i++){
1376
                    // vertical linear interpolation && yuv2rgb in a single step:
1377
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1378
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1379
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1380

    
1381
                    ((uint16_t*)dest)[i] =
1382
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1383
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1384
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1385
                }
1386
            }
1387
            else if (dstFormat==PIX_FMT_BGR555)
1388
            {
1389
                int i;
1390
                for (i=0;i<dstW;i++){
1391
                    // vertical linear interpolation && yuv2rgb in a single step:
1392
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1393
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1394
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1395

    
1396
                    ((uint16_t*)dest)[i] =
1397
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1398
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1399
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1400
                }
1401
            }
1402
        }//FULL_UV_IPOL
1403
    else
1404
    {
1405
#endif // if 0
1406
#ifdef HAVE_MMX
1407
        switch(c->dstFormat)
1408
        {
1409
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1410
            case PIX_FMT_RGB32:
1411
                asm volatile(
1412
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1413
                "mov        %4, %%"REG_b"               \n\t"
1414
                "push %%"REG_BP"                        \n\t"
1415
                YSCALEYUV2RGB(%%REGBP, %5)
1416
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1417
                "pop %%"REG_BP"                         \n\t"
1418
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1419

    
1420
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1421
                "a" (&c->redDither)
1422
                );
1423
                return;
1424
            case PIX_FMT_BGR24:
1425
                asm volatile(
1426
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1427
                "mov        %4, %%"REG_b"               \n\t"
1428
                "push %%"REG_BP"                        \n\t"
1429
                YSCALEYUV2RGB(%%REGBP, %5)
1430
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1431
                "pop %%"REG_BP"                         \n\t"
1432
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1433
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1434
                "a" (&c->redDither)
1435
                );
1436
                return;
1437
            case PIX_FMT_BGR555:
1438
                asm volatile(
1439
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1440
                "mov        %4, %%"REG_b"               \n\t"
1441
                "push %%"REG_BP"                        \n\t"
1442
                YSCALEYUV2RGB(%%REGBP, %5)
1443
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1444
#ifdef DITHER1XBPP
1445
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1446
                "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1447
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1448
#endif
1449

    
1450
                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1451
                "pop %%"REG_BP"                         \n\t"
1452
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1453

    
1454
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1455
                "a" (&c->redDither)
1456
                );
1457
                return;
1458
            case PIX_FMT_BGR565:
1459
                asm volatile(
1460
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1461
                "mov        %4, %%"REG_b"               \n\t"
1462
                "push %%"REG_BP"                        \n\t"
1463
                YSCALEYUV2RGB(%%REGBP, %5)
1464
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1465
#ifdef DITHER1XBPP
1466
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1467
                "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1468
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1469
#endif
1470

    
1471
                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1472
                "pop %%"REG_BP"                         \n\t"
1473
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1474
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475
                "a" (&c->redDither)
1476
                );
1477
                return;
1478
            case PIX_FMT_YUYV422:
1479
                asm volatile(
1480
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1481
                "mov %4, %%"REG_b"                        \n\t"
1482
                "push %%"REG_BP"                        \n\t"
1483
                YSCALEYUV2PACKED(%%REGBP, %5)
1484
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1485
                "pop %%"REG_BP"                         \n\t"
1486
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1487
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488
                "a" (&c->redDither)
1489
                );
1490
                return;
1491
            default: break;
1492
        }
1493
#endif //HAVE_MMX
1494
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1495
}
1496

    
1497
/**
 * YV12 to RGB without scaling or interpolating.
 *
 * Converts one line of planar YUV to the packed format in dstFormat:
 * buf0 holds the luma line, uvbuf0/uvbuf1 the two chroma lines.
 * No vertical luma blend is done (yalpha1 == 0); chroma handling depends
 * on uvalpha (< 2048 selects the faster single-chroma-line variants).
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                          uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;       // used by the C fallback macros below

    uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
    const int yalpha= 4096; //FIXME ...

    if (flags&SWS_FULL_CHR_H_INT)
    {
        // full horizontal chroma interpolation requested: defer to yuv2packed2
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
        return;
    }

#ifdef HAVE_MMX
    /* MMX fast paths.  Common pattern in every case: REG_b is saved by hand
     * into the context at ESP_OFFSET (presumably because it cannot be listed
     * as clobbered, e.g. when it is the PIC register — TODO confirm), loaded
     * with dest, and restored afterwards.  8280(%5) == DSTW_OFFSET into the
     * context; the preprocessor cannot expand that macro inside the asm
     * strings (see the note in yuv2packed2 above). */
    if ( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but it's a bit faster
    {
        switch(dstFormat)
        {
        case PIX_FMT_RGB32:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR555:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            /* ordered dither before truncation to 5-5-5 */
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif
            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR565:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBPb, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            /* 6-bit green dither constant for 5-6-5 */
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif

            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        }
    }
    else
    {
        /* uvalpha >= 2048: same cases as above, but using the *1b macro
         * variants (these take both chroma lines into account). */
        switch(dstFormat)
        {
        case PIX_FMT_RGB32:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR555:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif
            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR565:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif

            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        }
    }
#endif
    /* Portable C fallback for formats not handled above (or no MMX). */
    if ( uvalpha < 2048 )
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
    }
}
1702

    
1703
//FIXME yuy2* can read up to 7 samples too much
1704

    
1705
/* Extract the luma plane from packed YUY2 (byte order Y0 U Y1 V ...):
 * every even byte of src is a Y sample.  Writes width bytes to dst. */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    asm volatile(
    /* bm01010101: byte mask keeping the low byte of each word (the Y samples) */
    "movq "MANGLE(bm01010101)", %%mm2           \n\t"
    /* REG_a counts from -width up to 0; the pointers are pre-biased by +width */
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t"
    "pand                %%mm2, %%mm0           \n\t"
    "pand                %%mm2, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t" // pack 8 luma bytes
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
    "add                    $8, %%"REG_a"       \n\t" // 8 pixels per iteration
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];   // Y is every second byte
#endif
}
1729

    
1730
/* De-interleave the chroma of packed YUY2 (Y0 U Y1 V ...): dstU receives
 * every 4th byte starting at offset 1, dstV at offset 3.  src2 must be
 * identical to src1 (asserted at the end). */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    asm volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t" // byte mask for the even bytes
    "mov                    %0, %%"REG_a"       \n\t" // negative counter, runs up to 0
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t" // drop Y, keep the chroma bytes
    "psrlw                  $8, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t" // mm0 = interleaved U/V bytes
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t" // isolate V
    "pand                %%mm4, %%mm1           \n\t" // isolate U
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t" // 4 V samples
    "movd                %%mm1, (%2, %%"REG_a") \n\t" // 4 U samples
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1764

    
1765
//this is almost identical to the previous, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1766
/* Extract the luma plane from packed UYVY (byte order U Y0 V Y1 ...):
 * every odd byte of src is a Y sample.  Writes width bytes to dst. */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    asm volatile(
    "mov                  %0, %%"REG_a"         \n\t" // negative counter, runs up to 0
    "1:                                         \n\t"
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
    "psrlw                $8, %%mm0             \n\t" // keep the high (Y) byte of each word
    "psrlw                $8, %%mm1             \n\t"
    "packuswb          %%mm1, %%mm0             \n\t" // pack 8 luma bytes
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
    "add                  $8, %%"REG_a"         \n\t" // 8 pixels per iteration
    " js                  1b                    \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1]; // Y is every second byte, offset by one
#endif
}
1789

    
1790
/* De-interleave the chroma of packed UYVY (U Y0 V Y1 ...): dstU receives
 * every 4th byte starting at offset 0, dstV at offset 2.  src2 must be
 * identical to src1 (asserted at the end). */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    asm volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t" // byte mask for the even bytes
    "mov                    %0, %%"REG_a"       \n\t" // negative counter, runs up to 0
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "pand                %%mm4, %%mm0           \n\t" // keep the chroma bytes (low of each word)
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t" // mm0 = interleaved U/V bytes
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t" // isolate V
    "pand                %%mm4, %%mm1           \n\t" // isolate U
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t" // 4 V samples
    "movd                %%mm1, (%2, %%"REG_a") \n\t" // 4 U samples
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
1824

    
1825
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1826
{
1827
    int i;
1828
    for (i=0; i<width; i++)
1829
    {
1830
        int b=  ((uint32_t*)src)[i]&0xFF;
1831
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
1832
        int r= (((uint32_t*)src)[i]>>16)&0xFF;
1833

    
1834
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
1835
    }
1836
}
1837

    
1838
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1839
{
1840
    int i;
1841
    assert(src1 == src2);
1842
    for (i=0; i<width; i++)
1843
    {
1844
        const int a= ((uint32_t*)src1)[2*i+0];
1845
        const int e= ((uint32_t*)src1)[2*i+1];
1846
        const int l= (a&0xFF00FF) + (e&0xFF00FF);
1847
        const int h= (a&0x00FF00) + (e&0x00FF00);
1848
        const int b=  l&0x3FF;
1849
        const int g=  h>>8;
1850
        const int r=  l>>16;
1851

    
1852
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1853
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1854
    }
1855
}
1856

    
1857
/* Convert packed 24-bit BGR to 8-bit luma.  The MMX path produces 8 output
 * bytes per loop iteration (two groups of 4 pixels), using pmaddwd with the
 * bgr2YCoeff constants and a final bgr2YOffset add. */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    asm volatile(
    "mov                        %2, %%"REG_a"   \n\t" // negative pixel counter
    "movq     "MANGLE(bgr2YCoeff)", %%mm6       \n\t"
    "movq          "MANGLE(w1111)", %%mm5       \n\t"
    "pxor                    %%mm7, %%mm7       \n\t" // mm7 = 0, for byte->word unpack
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t" // REG_d = 3*counter (byte offset, 3 B/pixel)
    ASMALIGN(4)
    "1:                                         \n\t"
    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
    /* first 4 pixels: unpack to words and multiply by the Y coefficients */
    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
    "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm1       \n\t"
    "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
    "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm3       \n\t"
    "pmaddwd                 %%mm6, %%mm0       \n\t"
    "pmaddwd                 %%mm6, %%mm1       \n\t"
    "pmaddwd                 %%mm6, %%mm2       \n\t"
    "pmaddwd                 %%mm6, %%mm3       \n\t"
#ifndef FAST_BGR2YV12
    /* extra precision: scale the intermediates down before re-packing */
    "psrad                      $8, %%mm0       \n\t"
    "psrad                      $8, %%mm1       \n\t"
    "psrad                      $8, %%mm2       \n\t"
    "psrad                      $8, %%mm3       \n\t"
#endif
    "packssdw                %%mm1, %%mm0       \n\t"
    "packssdw                %%mm3, %%mm2       \n\t"
    "pmaddwd                 %%mm5, %%mm0       \n\t" // horizontal add via w1111
    "pmaddwd                 %%mm5, %%mm2       \n\t"
    "packssdw                %%mm2, %%mm0       \n\t"
    "psraw                      $7, %%mm0       \n\t"

    /* second 4 pixels, same computation */
    "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
    "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
    "punpcklbw               %%mm7, %%mm4       \n\t"
    "punpcklbw               %%mm7, %%mm1       \n\t"
    "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
    "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm3       \n\t"
    "pmaddwd                 %%mm6, %%mm4       \n\t"
    "pmaddwd                 %%mm6, %%mm1       \n\t"
    "pmaddwd                 %%mm6, %%mm2       \n\t"
    "pmaddwd                 %%mm6, %%mm3       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                      $8, %%mm4       \n\t"
    "psrad                      $8, %%mm1       \n\t"
    "psrad                      $8, %%mm2       \n\t"
    "psrad                      $8, %%mm3       \n\t"
#endif
    "packssdw                %%mm1, %%mm4       \n\t"
    "packssdw                %%mm3, %%mm2       \n\t"
    "pmaddwd                 %%mm5, %%mm4       \n\t"
    "pmaddwd                 %%mm5, %%mm2       \n\t"
    "add                       $24, %%"REG_d"   \n\t" // advance source by 8 pixels * 3 bytes
    "packssdw                %%mm2, %%mm4       \n\t"
    "psraw                      $7, %%mm4       \n\t"

    "packuswb                %%mm4, %%mm0       \n\t" // 8 luma bytes
    "paddusb "MANGLE(bgr2YOffset)", %%mm0       \n\t"

    "movq                    %%mm0, (%1, %%"REG_a") \n\t"
    "add                        $8, %%"REG_a"   \n\t"
    " js                        1b              \n\t"
    : : "r" (src+width*3), "r" (dst+width), "g" (-width)
    : "%"REG_a, "%"REG_d
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        // 33<<(RGB2YUV_SHIFT-1) == 16.5 in fixed point (offset + rounding)
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
    }
#endif
}
1941

    
1942
/* Convert packed 24-bit BGR to horizontally subsampled U and V planes.
 * Each chroma sample is derived from the average (MMX2/3DNOW: pavg;
 * plain MMX and C: sum then shift) of two adjacent pixels.  The MMX path
 * emits 4 U and 4 V bytes per loop iteration.  src2 must equal src1. */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    asm volatile(
    "mov                        %3, %%"REG_a"   \n\t" // negative chroma-sample counter
    "movq          "MANGLE(w1111)", %%mm5       \n\t"
    "movq     "MANGLE(bgr2UCoeff)", %%mm6       \n\t"
    "pxor                    %%mm7, %%mm7       \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t" // REG_d = 6*counter
    "add                 %%"REG_d", %%"REG_d"   \n\t" // (6 source bytes per chroma sample)
    ASMALIGN(4)
    "1:                                         \n\t"
    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    /* average two adjacent pixels with pavg (note the 3-byte misalignment
     * handled via the $24-bit shift) */
    "movq          (%0, %%"REG_d"), %%mm0       \n\t"
    "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
    "movq                    %%mm0, %%mm1       \n\t"
    "movq                    %%mm2, %%mm3       \n\t"
    "psrlq                     $24, %%mm0       \n\t"
    "psrlq                     $24, %%mm2       \n\t"
    PAVGB(%%mm1, %%mm0)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
#else
    /* no pavg available: sum the two pixels as words and halve */
    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
    "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "paddw                   %%mm2, %%mm0       \n\t"
    "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
    "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm4       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "paddw                   %%mm4, %%mm2       \n\t"
    "psrlw                      $1, %%mm0       \n\t"
    "psrlw                      $1, %%mm2       \n\t"
#endif
    /* mm1/mm3 get the V coefficients, mm6 holds the U coefficients */
    "movq     "MANGLE(bgr2VCoeff)", %%mm1       \n\t"
    "movq     "MANGLE(bgr2VCoeff)", %%mm3       \n\t"

    "pmaddwd                 %%mm0, %%mm1       \n\t"
    "pmaddwd                 %%mm2, %%mm3       \n\t"
    "pmaddwd                 %%mm6, %%mm0       \n\t"
    "pmaddwd                 %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                      $8, %%mm0       \n\t"
    "psrad                      $8, %%mm1       \n\t"
    "psrad                      $8, %%mm2       \n\t"
    "psrad                      $8, %%mm3       \n\t"
#endif
    "packssdw                %%mm2, %%mm0       \n\t"
    "packssdw                %%mm3, %%mm1       \n\t"
    "pmaddwd                 %%mm5, %%mm0       \n\t" // horizontal add via w1111
    "pmaddwd                 %%mm5, %%mm1       \n\t"
    "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
    "psraw                      $7, %%mm0       \n\t"

    /* second pair of pixel pairs, same computation (mm5 is temporarily
     * reused as a scratch register in the plain-MMX variant) */
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    "movq       12(%0, %%"REG_d"), %%mm4       \n\t"
    "movq       18(%0, %%"REG_d"), %%mm2       \n\t"
    "movq                   %%mm4, %%mm1       \n\t"
    "movq                   %%mm2, %%mm3       \n\t"
    "psrlq                    $24, %%mm4       \n\t"
    "psrlq                    $24, %%mm2       \n\t"
    PAVGB(%%mm1, %%mm4)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw              %%mm7, %%mm4       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
#else
    "movd       12(%0, %%"REG_d"), %%mm4       \n\t"
    "movd       15(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw              %%mm7, %%mm4       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
    "paddw                  %%mm2, %%mm4       \n\t"
    "movd       18(%0, %%"REG_d"), %%mm5       \n\t"
    "movd       21(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw              %%mm7, %%mm5       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
    "paddw                  %%mm5, %%mm2       \n\t"
    "movq         "MANGLE(w1111)", %%mm5       \n\t" // restore mm5 after scratch use
    "psrlw                     $2, %%mm4       \n\t"
    "psrlw                     $2, %%mm2       \n\t"
#endif
    "movq    "MANGLE(bgr2VCoeff)", %%mm1       \n\t"
    "movq    "MANGLE(bgr2VCoeff)", %%mm3       \n\t"

    "pmaddwd                %%mm4, %%mm1       \n\t"
    "pmaddwd                %%mm2, %%mm3       \n\t"
    "pmaddwd                %%mm6, %%mm4       \n\t"
    "pmaddwd                %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                     $8, %%mm4       \n\t"
    "psrad                     $8, %%mm1       \n\t"
    "psrad                     $8, %%mm2       \n\t"
    "psrad                     $8, %%mm3       \n\t"
#endif
    "packssdw               %%mm2, %%mm4       \n\t"
    "packssdw               %%mm3, %%mm1       \n\t"
    "pmaddwd                %%mm5, %%mm4       \n\t"
    "pmaddwd                %%mm5, %%mm1       \n\t"
    "add                      $24, %%"REG_d"   \n\t" // advance source by 4 chroma samples * 6 bytes
    "packssdw               %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
    "psraw                     $7, %%mm4       \n\t"

    /* merge both halves, split into U and V nibbles of the qword */
    "movq                   %%mm0, %%mm1       \n\t"
    "punpckldq              %%mm4, %%mm0       \n\t"
    "punpckhdq              %%mm4, %%mm1       \n\t"
    "packsswb               %%mm1, %%mm0       \n\t"
    "paddb "MANGLE(bgr2UVOffset)", %%mm0       \n\t" // add the +128 chroma bias

    "movd                   %%mm0, (%1, %%"REG_a")  \n\t" // 4 U bytes
    "punpckhdq              %%mm0, %%mm0            \n\t"
    "movd                   %%mm0, (%2, %%"REG_a")  \n\t" // 4 V bytes
    "add                       $4, %%"REG_a"        \n\t"
    " js                       1b                   \n\t"
    : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
    : "%"REG_a, "%"REG_d
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        // sum of two adjacent pixels; the +1 in the shift below divides it out
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
    }
#endif
    assert(src1 == src2);
}
2075

    
2076
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2077
{
2078
    int i;
2079
    for (i=0; i<width; i++)
2080
    {
2081
        int d= ((uint16_t*)src)[i];
2082
        int b= d&0x1F;
2083
        int g= (d>>5)&0x3F;
2084
        int r= (d>>11)&0x1F;
2085

    
2086
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2087
    }
2088
}
2089

    
2090
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2091
{
2092
    int i;
2093
    assert(src1==src2);
2094
    for (i=0; i<width; i++)
2095
    {
2096
        int d0= ((uint32_t*)src1)[i];
2097

    
2098
        int dl= (d0&0x07E0F81F);
2099
        int dh= ((d0>>5)&0x07C0F83F);
2100

    
2101
        int dh2= (dh>>11) + (dh<<21);
2102
        int d= dh2 + dl;
2103

    
2104
        int b= d&0x7F;
2105
        int r= (d>>11)&0x7F;
2106
        int g= d>>21;
2107
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2108
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2109
    }
2110
}
2111

    
2112
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2113
{
2114
    int i;
2115
    for (i=0; i<width; i++)
2116
    {
2117
        int d= ((uint16_t*)src)[i];
2118
        int b= d&0x1F;
2119
        int g= (d>>5)&0x1F;
2120
        int r= (d>>10)&0x1F;
2121

    
2122
        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2123
    }
2124
}
2125

    
2126
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2127
{
2128
    int i;
2129
    assert(src1==src2);
2130
    for (i=0; i<width; i++)
2131
    {
2132
        int d0= ((uint32_t*)src1)[i];
2133

    
2134
        int dl= (d0&0x03E07C1F);
2135
        int dh= ((d0>>5)&0x03E0F81F);
2136

    
2137
        int dh2= (dh>>11) + (dh<<21);
2138
        int d= dh2 + dl;
2139

    
2140
        int b= d&0x7F;
2141
        int r= (d>>10)&0x7F;
2142
        int g= d>>21;
2143
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2144
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2145
    }
2146
}
2147

    
2148

    
2149
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2150
{
2151
    int i;
2152
    for (i=0; i<width; i++)
2153
    {
2154
        int r=  ((uint32_t*)src)[i]&0xFF;
2155
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
2156
        int b= (((uint32_t*)src)[i]>>16)&0xFF;
2157

    
2158
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2159
    }
2160
}
2161

    
2162
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2163
{
2164
    int i;
2165
    assert(src1==src2);
2166
    for (i=0; i<width; i++)
2167
    {
2168
        const int a= ((uint32_t*)src1)[2*i+0];
2169
        const int e= ((uint32_t*)src1)[2*i+1];
2170
        const int l= (a&0xFF00FF) + (e&0xFF00FF);
2171
        const int h= (a&0x00FF00) + (e&0x00FF00);
2172
        const int r=  l&0x3FF;
2173
        const int g=  h>>8;
2174
        const int b=  l>>16;
2175

    
2176
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2177
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2178
    }
2179
}
2180

    
2181
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2182
{
2183
    int i;
2184
    for (i=0; i<width; i++)
2185
    {
2186
        int r= src[i*3+0];
2187
        int g= src[i*3+1];
2188
        int b= src[i*3+2];
2189

    
2190
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)) )>>RGB2YUV_SHIFT);
2191
    }
2192
}
2193

    
2194
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2195
{
2196
    int i;
2197
    assert(src1==src2);
2198
    for (i=0; i<width; i++)
2199
    {
2200
        int r= src1[6*i + 0] + src1[6*i + 3];
2201
        int g= src1[6*i + 1] + src1[6*i + 4];
2202
        int b= src1[6*i + 2] + src1[6*i + 5];
2203

    
2204
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2205
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2206
    }
2207
}
2208

    
2209
static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2210
{
2211
    int i;
2212
    for (i=0; i<width; i++)
2213
    {
2214
        int d= ((uint16_t*)src)[i];
2215
        int r= d&0x1F;
2216
        int g= (d>>5)&0x3F;
2217
        int b= (d>>11)&0x1F;
2218

    
2219
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2220
    }
2221
}
2222

    
2223
static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2224
{
2225
    int i;
2226
    assert(src1 == src2);
2227
    for (i=0; i<width; i++)
2228
    {
2229
        int d0= ((uint32_t*)src1)[i];
2230

    
2231
        int dl= (d0&0x07E0F81F);
2232
        int dh= ((d0>>5)&0x07C0F83F);
2233

    
2234
        int dh2= (dh>>11) + (dh<<21);
2235
        int d= dh2 + dl;
2236

    
2237
        int r= d&0x7F;
2238
        int b= (d>>11)&0x7F;
2239
        int g= d>>21;
2240
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2241
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2242
    }
2243
}
2244

    
2245
static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2246
{
2247
    int i;
2248
    for (i=0; i<width; i++)
2249
    {
2250
        int d= ((uint16_t*)src)[i];
2251
        int r= d&0x1F;
2252
        int g= (d>>5)&0x1F;
2253
        int b= (d>>10)&0x1F;
2254

    
2255
        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2256
    }
2257
}
2258

    
2259
static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2260
{
2261
    int i;
2262
    assert(src1 == src2);
2263
    for (i=0; i<width; i++)
2264
    {
2265
        int d0= ((uint32_t*)src1)[i];
2266

    
2267
        int dl= (d0&0x03E07C1F);
2268
        int dh= ((d0>>5)&0x03E0F81F);
2269

    
2270
        int dh2= (dh>>11) + (dh<<21);
2271
        int d= dh2 + dl;
2272

    
2273
        int g= d&0x7F;
2274
        int r= (d>>10)&0x7F;
2275
        int b= d>>21;
2276
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2277
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2278
    }
2279
}
2280

    
2281
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2282
{
2283
    int i;
2284
    for (i=0; i<width; i++)
2285
    {
2286
        int d= src[i];
2287

    
2288
        dst[i]= pal[d] & 0xFF;
2289
    }
2290
}
2291

    
2292
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2293
{
2294
    int i;
2295
    assert(src1 == src2);
2296
    for (i=0; i<width; i++)
2297
    {
2298
        int p= pal[src1[i]];
2299

    
2300
        dstU[i]= p>>8;
2301
        dstV[i]= p>>16;
2302
    }
2303
}
2304

    
2305
// Bilinear / Bicubic scaling
2306
/*
 * Horizontally scale one line with an arbitrary FIR filter.
 * For each output sample i (C reference path):
 *     dst[i] = clip( (sum_j src[filterPos[i]+j] * filter[i*filterSize+j]) >> 7,
 *                    0, (1<<15)-1 )
 * i.e. dst holds 15-bit intermediate samples.
 * The MMX paths are specialised for filterSize 4 and 8 (fully unrolled inner
 * product) with a generic two-level loop for larger sizes; without MMX,
 * AltiVec or plain C is used.  filterSize must be a positive multiple of 4.
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#ifdef HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        /* counter runs from -2*dstW up to 0; the pointers are pre-biased so
           indexing with the (negative) counter walks the arrays forward */
        long counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push            %%"REG_b"              \n\t"
#endif
        "pxor                %%mm7, %%mm7       \n\t"
        "movq        "MANGLE(w02)", %%mm6       \n\t"
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        /* two output samples per iteration: fetch their filterPos ... */
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
        /* ... the 4 filter taps each, and 4 source pixels each */
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
        "movd      (%3, %%"REG_a"), %%mm0       \n\t"
        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
        "punpcklbw           %%mm7, %%mm0       \n\t"
        "punpcklbw           %%mm7, %%mm2       \n\t"
        "pmaddwd             %%mm1, %%mm0       \n\t"
        "pmaddwd             %%mm2, %%mm3       \n\t"
        "psrad                  $8, %%mm0       \n\t"
        "psrad                  $8, %%mm3       \n\t"
        "packssdw            %%mm3, %%mm0       \n\t"
        "pmaddwd             %%mm6, %%mm0       \n\t"
        "packssdw            %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
        "add                    $4, %%"REG_BP"  \n\t"
        " jnc                   1b              \n\t"

        "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
        "pop             %%"REG_b"              \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        long counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push             %%"REG_b"             \n\t"
#endif
        "pxor                 %%mm7, %%mm7      \n\t"
        "movq         "MANGLE(w02)", %%mm6      \n\t"
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
        "mov              %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        /* same as the 4-tap loop, but each sample accumulates two
           4-tap partial products (taps 0-3, then taps 4-7) */
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm0      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm0      \n\t"
        "pmaddwd              %%mm2, %%mm3      \n\t"

        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm4      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm4      \n\t"
        "pmaddwd              %%mm2, %%mm5      \n\t"
        "paddd                %%mm4, %%mm0      \n\t"
        "paddd                %%mm5, %%mm3      \n\t"

        "psrad                   $8, %%mm0      \n\t"
        "psrad                   $8, %%mm3      \n\t"
        "packssdw             %%mm3, %%mm0      \n\t"
        "pmaddwd              %%mm6, %%mm0      \n\t"
        "packssdw             %%mm0, %%mm0      \n\t"
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
        "add                     $4, %%"REG_BP" \n\t"
        " jnc                    1b             \n\t"

        "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
        "pop              %%"REG_b"             \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else
    {
        /* generic path: inner loop (label 2) walks filterSize taps in
           4-tap chunks, outer loop (label 1) walks output samples */
        uint8_t *offset = src+filterSize;
        long counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
        "pxor                  %%mm7, %%mm7     \n\t"
        "movq          "MANGLE(w02)", %%mm6     \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "mov                      %2, %%"REG_c" \n\t"
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
        "mov                      %5, %%"REG_c" \n\t"
        "pxor                  %%mm4, %%mm4     \n\t"
        "pxor                  %%mm5, %%mm5     \n\t"
        "2:                                     \n\t"
        "movq                   (%1), %%mm1     \n\t"
        "movq               (%1, %6), %%mm3     \n\t"
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
        "punpcklbw             %%mm7, %%mm0     \n\t"
        "punpcklbw             %%mm7, %%mm2     \n\t"
        "pmaddwd               %%mm1, %%mm0     \n\t"
        "pmaddwd               %%mm2, %%mm3     \n\t"
        "paddd                 %%mm3, %%mm5     \n\t"
        "paddd                 %%mm0, %%mm4     \n\t"
        "add                      $8, %1        \n\t"
        "add                      $4, %%"REG_c" \n\t"
        "cmp                      %4, %%"REG_c" \n\t"
        " jb                      2b            \n\t"
        "add                      %6, %1        \n\t"
        "psrad                    $8, %%mm4     \n\t"
        "psrad                    $8, %%mm5     \n\t"
        "packssdw              %%mm5, %%mm4     \n\t"
        "pmaddwd               %%mm6, %%mm4     \n\t"
        "packssdw              %%mm4, %%mm4     \n\t"
        "mov                      %3, %%"REG_a" \n\t"
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
        "add                      $4, %0        \n\t"
        " jnc                     1b            \n\t"

        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" (filterSize*2)
        : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#ifdef HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* plain C reference implementation */
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif
#endif
}
2488
      // *** horizontal scale Y line to temp buffer
2489
/*
 * Horizontally scale the luma (Y) line of one input row into the 16-bit
 * temporary buffer dst (dstWidth samples).
 *
 * Packed YUV / RGB / paletted sources are first converted to an 8-bit
 * luma line in formatConvBuffer, then scaled from there.  When
 * SWS_FAST_BILINEAR is not set (or, with MMX, when the MMX2 path cannot
 * be used) the generic FIR path hScale() is taken; otherwise a fast
 * bilinear path runs: either the runtime-generated MMX2 scaler invoked
 * indirectly via funnyYCode, plain x86 asm, or portable C.
 */
static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint8_t *pal)
{
    /* input format dispatch: unpack anything that is not already an
       8-bit luma plane into formatConvBuffer */
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
    // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            /* MMX2 path: funnyYCode is runtime-generated scaler code,
               called indirectly; mmx2FilterPos drives the chunk sizes */
            asm volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t"
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t"
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyYCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            /* the generated code may over-read near the right edge:
               patch the trailing samples with the last source pixel */
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif
        /* plain x86 asm bilinear: 16.16 fixed-point position, two output
           samples per loop iteration */
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        asm volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry


        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"


        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* portable C bilinear: 16.16 fixed-point source position,
           7-bit interpolation factor, output scaled by 128 */
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif
    }
}
2690

    
2691
inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2692
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2693
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2694
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2695
                                   int32_t *mmx2FilterPos, uint8_t *pal)
2696
{
2697
    if (srcFormat==PIX_FMT_YUYV422)
2698
    {
2699
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2700
        src1= formatConvBuffer;
2701
        src2= formatConvBuffer+2048;
2702
    }
2703
    else if (srcFormat==PIX_FMT_UYVY422)
2704
    {
2705
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2706
        src1= formatConvBuffer;
2707
        src2= formatConvBuffer+2048;
2708
    }
2709
    else if (srcFormat==PIX_FMT_RGB32)
2710
    {
2711
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2712
        src1= formatConvBuffer;
2713
        src2= formatConvBuffer+2048;
2714
    }
2715
    else if (srcFormat==PIX_FMT_BGR24)
2716
    {
2717
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2718
        src1= formatConvBuffer;
2719
        src2= formatConvBuffer+2048;
2720
    }
2721
    else if (srcFormat==PIX_FMT_BGR565)
2722
    {
2723
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2724
        src1= formatConvBuffer;
2725
        src2= formatConvBuffer+2048;
2726
    }
2727
    else if (srcFormat==PIX_FMT_BGR555)
2728
    {
2729
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2730
        src1= formatConvBuffer;
2731
        src2= formatConvBuffer+2048;
2732
    }
2733
    else if (srcFormat==PIX_FMT_BGR32)
2734
    {
2735
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2736
        src1= formatConvBuffer;
2737
        src2= formatConvBuffer+2048;
2738
    }
2739
    else if (srcFormat==PIX_FMT_RGB24)
2740
    {
2741
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2742
        src1= formatConvBuffer;
2743
        src2= formatConvBuffer+2048;
2744
    }
2745
    else if (srcFormat==PIX_FMT_RGB565)
2746
    {
2747
        RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2748
        src1= formatConvBuffer;
2749
        src2= formatConvBuffer+2048;
2750
    }
2751
    else if (srcFormat==PIX_FMT_RGB555)
2752
    {
2753
        RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2754
        src1= formatConvBuffer;
2755
        src2= formatConvBuffer+2048;
2756
    }
2757
    else if (isGray(srcFormat))
2758
    {
2759
        return;
2760
    }
2761
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2762
    {
2763
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW, pal);
2764
        src1= formatConvBuffer;
2765
        src2= formatConvBuffer+2048;
2766
    }
2767

    
2768
#ifdef HAVE_MMX
2769
    // use the new MMX scaler if the mmx2 can't be used (its faster than the x86asm one)
2770
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2771
#else
2772
    if (!(flags&SWS_FAST_BILINEAR))
2773
#endif
2774
    {
2775
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2776
        RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2777
    }
2778
    else // Fast Bilinear upscale / crap downscale
2779
    {
2780
#if defined(ARCH_X86)
2781
#ifdef HAVE_MMX2
2782
        int i;
2783
#if defined(PIC)
2784
        uint64_t ebxsave __attribute__((aligned(8)));
2785
#endif
2786
        if (canMMX2BeUsed)
2787
        {
2788
            asm volatile(
2789
#if defined(PIC)
2790
            "mov          %%"REG_b", %6         \n\t"
2791
#endif
2792
            "pxor             %%mm7, %%mm7      \n\t"
2793
            "mov                 %0, %%"REG_c"  \n\t"
2794
            "mov                 %1, %%"REG_D"  \n\t"
2795
            "mov                 %2, %%"REG_d"  \n\t"
2796
            "mov                 %3, %%"REG_b"  \n\t"
2797
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2798
            PREFETCH"   (%%"REG_c")             \n\t"
2799
            PREFETCH" 32(%%"REG_c")             \n\t"
2800
            PREFETCH" 64(%%"REG_c")             \n\t"
2801

    
2802
#ifdef ARCH_X86_64
2803

    
2804
#define FUNNY_UV_CODE \
2805
            "movl       (%%"REG_b"), %%esi      \n\t"\
2806
            "call               *%4             \n\t"\
2807
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2808
            "add          %%"REG_S", %%"REG_c"  \n\t"\
2809
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2810
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2811

    
2812
#else
2813

    
2814
#define FUNNY_UV_CODE \
2815
            "movl       (%%"REG_b"), %%esi      \n\t"\
2816
            "call               *%4             \n\t"\
2817
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2818
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2819
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2820

    
2821
#endif
2822

    
2823
FUNNY_UV_CODE
2824
FUNNY_UV_CODE
2825
FUNNY_UV_CODE
2826
FUNNY_UV_CODE
2827
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2828
            "mov                 %5, %%"REG_c"  \n\t" // src
2829
            "mov                 %1, %%"REG_D"  \n\t" // buf1
2830
            "add              $4096, %%"REG_D"  \n\t"
2831
            PREFETCH"   (%%"REG_c")             \n\t"
2832
            PREFETCH" 32(%%"REG_c")             \n\t"
2833
            PREFETCH" 64(%%"REG_c")             \n\t"
2834

    
2835
FUNNY_UV_CODE
2836
FUNNY_UV_CODE
2837
FUNNY_UV_CODE
2838
FUNNY_UV_CODE
2839

    
2840
#if defined(PIC)
2841
            "mov %6, %%"REG_b"    \n\t"
2842
#endif
2843
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2844
            "m" (funnyUVCode), "m" (src2)
2845
#if defined(PIC)
2846
            ,"m" (ebxsave)
2847
#endif
2848
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2849
#if !defined(PIC)
2850
             ,"%"REG_b
2851
#endif
2852
            );
2853
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2854
            {
2855
                //printf("%d %d %d\n", dstWidth, i, srcW);
2856
                dst[i] = src1[srcW-1]*128;
2857
                dst[i+2048] = src2[srcW-1]*128;
2858
            }
2859
        }
2860
        else
2861
        {
2862
#endif
2863
            long xInc_shr16 = (long) (xInc >> 16);
2864
            uint16_t xInc_mask = xInc & 0xffff;
2865
            asm volatile(
2866
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
2867
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2868
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
2869
            ASMALIGN(4)
2870
            "1:                                     \n\t"
2871
            "mov        %0, %%"REG_S"               \n\t"
2872
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2873
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2874
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2875
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2876
            "shll      $16, %%edi                   \n\t"
2877
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2878
            "mov        %1, %%"REG_D"               \n\t"
2879
            "shrl       $9, %%esi                   \n\t"
2880
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2881

    
2882
            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2883
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2884
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2885
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2886
            "shll      $16, %%edi                   \n\t"
2887
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2888
            "mov        %1, %%"REG_D"               \n\t"
2889
            "shrl       $9, %%esi                   \n\t"
2890
            "movw     %%si, 4096(%%"REG_D", %%"REG_a", 2)   \n\t"
2891

    
2892
            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
2893
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
2894
            "add        $1, %%"REG_a"               \n\t"
2895
            "cmp        %2, %%"REG_a"               \n\t"
2896
            " jb        1b                          \n\t"
2897

    
2898
/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2899
   which is needed to support GCC-4.0 */
2900
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || ( __GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2901
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2902
#else
2903
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2904
#endif
2905
            "r" (src2)
2906
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2907
            );
2908
#ifdef HAVE_MMX2
2909
        } //if MMX2 can't be used
2910
#endif
2911
#else
2912
        int i;
2913
        unsigned int xpos=0;
2914
        for (i=0;i<dstWidth;i++)
2915
        {
2916
            register unsigned int xx=xpos>>16;
2917
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
2918
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2919
            dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2920
            /* slower
2921
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2922
            dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2923
            */
2924
            xpos+=xInc;
2925
        }
2926
#endif
2927
    }
2928
}
2929

    
2930
/* Main scaling entry point of this template instantiation (RENAME() expands
 * to a per-CPU-variant name).  Scales one slice of srcSliceH source lines,
 * starting at srcSliceY, into dst[] and returns the number of destination
 * lines actually written.  When the slice does not contain enough input to
 * produce the next output line, the function stores its progress back into
 * the context and returns early, so a later call with the next slice can
 * continue where this one stopped. */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    const int canMMX2BeUsed= c->canMMX2BeUsed;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *funnyYCode= c->funnyYCode;
    uint8_t *funnyUVCode= c->funnyUVCode;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint8_t *pal=NULL;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* packed input: the "second plane" slot carries the palette; all three
       plane pointers/strides are aliased onto plane 0 */
    if (isPacked(c->srcFormat)){
        pal= src[1];
        src[0]=
        src[1]=
        src[2]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

#if 0 //self test FIXME move to a vfilter or something
    {
    static volatile int i=0;
    i++;
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
    i--;
    }
#endif

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
    {
        static int firstTime=1; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && firstTime)
        {
            av_log(c, AV_LOG_WARNING, "SwScaler: Warning: dstStride is not aligned!\n"
                   "SwScaler:          ->cannot do aligned memory acesses anymore\n");
            firstTime=0;
        }
    }

    /* Note the user might start scaling the picture in the middle so this will not get executed
       this is not really intended but works currently, so ppl might do it */
    if (srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
        ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

        // Do we have enough lines in this slice to output the dstY line
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling of every input line still missing from the ring buffers
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                ASSERT(lumBufIndex < 2*vLumBufSize)
                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                ASSERT(chrBufIndex < 2*vChrBufSize)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
                //FIXME replace parameters through context struct (some at least)

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest in the buffer
        {
            //Do horizontal scaling
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                ASSERT(lumBufIndex < 2*vLumBufSize)
                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                ASSERT(chrBufIndex < 2*vChrBufSize)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                            flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                            funnyUVCode, c->srcFormat, formatConvBuffer,
                            c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize ) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize ) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }

#ifdef HAVE_MMX
        b5Dither= dither8[dstY&1];
        g6Dither= dither4[dstY&1];
        g5Dither= dither8[dstY&1];
        r5Dither= dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2)
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
            int i;
        /* prepare the per-line MMX filter tables (src line pointers interleaved
           with the coefficients) consumed by the assembly output functions */
        if (flags & SWS_ACCURATE_RND){
            for (i=0; i<vLumFilterSize; i+=2){
                lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i  ];
                lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
                lumMmxFilter[2*i+2]=
                lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i    ]
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
            }
            for (i=0; i<vChrFilterSize; i+=2){
                chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i  ];
                chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
                chrMmxFilter[2*i+2]=
                chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
            }
        }else{
            for (i=0; i<vLumFilterSize; i++)
            {
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
            }
            for (i=0; i<vChrFilterSize; i++)
            {
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
        }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                RENAME(yuv2nv12X)(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, uDest, vDest, dstW, chrDstW);
                }
            }
            else
            {
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, lumAlpha, chrAlpha, dstY);
                }
                else //General RGB
                {
                    RENAME(yuv2packedX)(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, vDest, dstW, chrDstW);
            }
            else
            {
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                yuv2packedXinC(c,
                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                    vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, dstW, dstY);
            }
        }
    }

#ifdef HAVE_MMX
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}