/* ffmpeg / libswscale / swscale_template.c @ ad40b153 */
/*
 * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * the C code (not assembly, mmx, ...) of this file can be used
 * under the LGPL license too
 */
/* This template is included multiple times with different CPU-capability
 * defines; clear any expansion left over from a previous inclusion. */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
#undef PREFETCH
#undef PREFETCHW
#undef EMMS
#undef SFENCE
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif
/* Pick the prefetch instructions available on the target CPU; fall back to
 * a comment-only no-op inside the asm string when none exist. */
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#elif defined (HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif
/* Store fence is only needed/available with MMX2 (paired with movntq). */
#ifdef HAVE_MMX2
#define SFENCE "sfence"
#else
#define SFENCE " # nop"
#endif
/* Byte average: pavgb on MMX2, pavgusb on 3DNow. Note: intentionally left
 * undefined when neither is available — code using PAVGB is MMX2/3DNow-only. */
#ifdef HAVE_MMX2
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
/* Non-temporal store when available, plain movq otherwise. MOVNTQ is a
 * two-level macro so that macro arguments are expanded before stringization. */
#ifdef HAVE_MMX2
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
/* PowerPC AltiVec implementations live in a separate template. */
#ifdef HAVE_ALTIVEC
#include "swscale_altivec_template.c"
#endif
/* MMX vertical scaler for one planar output row: walks the NULL-terminated
 * (srcPtr, coeff) filter list at "offset"(%0), accumulates pmulhw products
 * into the rounder value, shifts >>3 and packs with unsigned saturation,
 * storing 8 output bytes per iteration via MOVNTQ. */
#define YSCALEYUV2YV12X(x, offset, dest, width) \
    asm volatile(\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                                 \n\t"\
    "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
    "movq   " #x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm5      \n\t" /* srcData */\
    "add                                $16, %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "pmulhw                           %%mm0, %%mm2      \n\t"\
    "pmulhw                           %%mm0, %%mm5      \n\t"\
    "paddw                            %%mm2, %%mm3      \n\t"\
    "paddw                            %%mm5, %%mm4      \n\t"\
    " jnz                                1b             \n\t" /* loop until src ptr is NULL */\
    "psraw                               $3, %%mm3      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "packuswb                         %%mm4, %%mm3      \n\t"\
    MOVNTQ(%%mm3, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm3      \n\t"\
    "movq                             %%mm3, %%mm4      \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
/* Higher-precision variant of YSCALEYUV2YV12X: consumes filter taps in pairs
 * with pmaddwd into four 32-bit accumulators, then >>16, packs to words, adds
 * the rounder, >>3 and packs to bytes with unsigned saturation. */
#define YSCALEYUV2YV12X_ACCURATE(x, offset, dest, width) \
    asm volatile(\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "xor                          %%"REG_a", %%"REG_a"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    ASMALIGN(4) \
    "1:                                                 \n\t"\
    "movq   " #x "(%%"REG_S", %%"REG_a", 2), %%mm0      \n\t" /* srcData */\
    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm2      \n\t" /* srcData */\
    "mov                       4(%%"REG_d"), %%"REG_S"  \n\t"\
    "movq   " #x "(%%"REG_S", %%"REG_a", 2), %%mm1      \n\t" /* srcData */\
    "movq                             %%mm0, %%mm3      \n\t"\
    "punpcklwd                        %%mm1, %%mm0      \n\t"\
    "punpckhwd                        %%mm1, %%mm3      \n\t"\
    "movq                      8(%%"REG_d"), %%mm1      \n\t" /* filterCoeff */\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm3      \n\t"\
    "paddd                            %%mm0, %%mm4      \n\t"\
    "paddd                            %%mm3, %%mm5      \n\t"\
    "movq 8+" #x "(%%"REG_S", %%"REG_a", 2), %%mm3      \n\t" /* srcData */\
    "mov                      16(%%"REG_d"), %%"REG_S"  \n\t"\
    "add                                $16, %%"REG_d"  \n\t"\
    "test                         %%"REG_S", %%"REG_S"  \n\t"\
    "movq                             %%mm2, %%mm0      \n\t"\
    "punpcklwd                        %%mm3, %%mm2      \n\t"\
    "punpckhwd                        %%mm3, %%mm0      \n\t"\
    "pmaddwd                          %%mm1, %%mm2      \n\t"\
    "pmaddwd                          %%mm1, %%mm0      \n\t"\
    "paddd                            %%mm2, %%mm6      \n\t"\
    "paddd                            %%mm0, %%mm7      \n\t"\
    " jnz                                1b             \n\t" /* loop until src ptr is NULL */\
    "psrad                              $16, %%mm4      \n\t"\
    "psrad                              $16, %%mm5      \n\t"\
    "psrad                              $16, %%mm6      \n\t"\
    "psrad                              $16, %%mm7      \n\t"\
    "movq             "VROUNDER_OFFSET"(%0), %%mm0      \n\t"\
    "packssdw                         %%mm5, %%mm4      \n\t"\
    "packssdw                         %%mm7, %%mm6      \n\t"\
    "paddw                            %%mm0, %%mm4      \n\t"\
    "paddw                            %%mm0, %%mm6      \n\t"\
    "psraw                               $3, %%mm4      \n\t"\
    "psraw                               $3, %%mm6      \n\t"\
    "packuswb                         %%mm6, %%mm4      \n\t"\
    MOVNTQ(%%mm4, (%1, %%REGa))\
    "add                                 $8, %%"REG_a"  \n\t"\
    "cmp                                 %2, %%"REG_a"  \n\t"\
    "lea                     " offset "(%0), %%"REG_d"  \n\t"\
    "pxor                             %%mm4, %%mm4      \n\t"\
    "pxor                             %%mm5, %%mm5      \n\t"\
    "pxor                             %%mm6, %%mm6      \n\t"\
    "pxor                             %%mm7, %%mm7      \n\t"\
    "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
    "jb                                  1b             \n\t"\
    :: "r" (&c->redDither),\
    "r" (dest), "g" (width)\
    : "%"REG_a, "%"REG_d, "%"REG_S\
    );
/* Trivial 1-tap case: no filtering, just >>7 and pack 16->8 bit with
 * unsigned saturation. Counts %REG_a up from a negative offset in %2. */
#define YSCALEYUV2YV121 \
    "mov %2, %%"REG_a"                    \n\t"\
    ASMALIGN(4) /* FIXME Unroll? */\
    "1:                                   \n\t"\
    "movq  (%0, %%"REG_a", 2), %%mm0      \n\t"\
    "movq 8(%0, %%"REG_a", 2), %%mm1      \n\t"\
    "psraw                 $7, %%mm0      \n\t"\
    "psraw                 $7, %%mm1      \n\t"\
    "packuswb           %%mm1, %%mm0      \n\t"\
    MOVNTQ(%%mm0, (%1, %%REGa))\
    "add                   $8, %%"REG_a"  \n\t"\
    "jnc                   1b             \n\t"
/*
    :: "m" (-lumFilterSize), "m" (-chrFilterSize),
       "m" (lumMmxFilter+lumFilterSize*4), "m" (chrMmxFilter+chrFilterSize*4),
       "r" (dest), "m" (dstW),
       "m" (lumSrc+lumFilterSize), "m" (chrSrc+chrFilterSize)
    : "%eax", "%ebx", "%ecx", "%edx", "%esi"
*/
/* Opening half of the packed-output vertical scaler: runs the chroma filter
 * list (U in mm3, V in mm4; V plane is at a +4096 byte offset from U) and
 * the luma filter list (Y1 in mm1, Y2 in mm7). Each inner loop ends when the
 * next source pointer in the filter list is NULL. Must be followed by writer
 * code and YSCALEYUV2PACKEDX_END. */
#define YSCALEYUV2PACKEDX \
    asm volatile(\
    "xor                   %%"REG_a", %%"REG_a"     \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
    "movq                      %%mm3, %%mm4         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
    "movq 4096(%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
    "add                         $16, %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm3         \n\t"\
    "paddw                     %%mm5, %%mm4         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm1         \n\t"\
    "movq                      %%mm1, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm5         \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                    %%mm0, %%mm2         \n\t"\
    "pmulhw                    %%mm0, %%mm5         \n\t"\
    "paddw                     %%mm2, %%mm1         \n\t"\
    "paddw                     %%mm5, %%mm7         \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"
/* Closing half of the YSCALEYUV2PACKEDX asm statement: operand and clobber
 * lists. The dummy "m" operands keep the operand numbering (%1, %2) stable. */
#define YSCALEYUV2PACKEDX_END                 \
    :: "r" (&c->redDither),                   \
        "m" (dummy), "m" (dummy), "m" (dummy),\
        "r" (dest), "m" (dstW)                \
    : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
/* Higher-precision opening half (pmaddwd pairs of taps, 32-bit accumulators).
 * Filtered chroma is parked in U_TEMP/V_TEMP and reloaded into mm3/mm4 at the
 * end; luma ends up in mm1 (Y1) and mm7 (Y2). Pair with YSCALEYUV2PACKEDX_END. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    asm volatile(\
    "xor %%"REG_a", %%"REG_a"                       \n\t"\
    ASMALIGN(4)\
    "nop                                            \n\t"\
    "1:                                             \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm4, %%mm4         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
    "movq 4096(%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm1, %%mm0         \n\t"\
    "punpckhwd                 %%mm1, %%mm3         \n\t"\
    "movq               8(%%"REG_d"), %%mm1         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm4         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 4096(%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
    "add                         $16, %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm1, %%mm2         \n\t"\
    "pmaddwd                   %%mm1, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm6         \n\t"\
    "paddd                     %%mm0, %%mm7         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm4         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm4         \n\t"\
    "packssdw                  %%mm7, %%mm6         \n\t"\
    "paddw                     %%mm0, %%mm4         \n\t"\
    "paddw                     %%mm0, %%mm6         \n\t"\
    "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
    "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\
\
    "lea "LUM_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ASMALIGN(4)\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov                4(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq               8(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov               16(%%"REG_d"), %%"REG_S"     \n\t"\
    "add                         $16, %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"
/* YUV->RGB conversion core: expects Y1/Y2 in mm1/mm7 and filtered U/V in
 * mm3/mm4 (as produced by the PACKEDX macros above). Applies the per-context
 * offset/coefficient tables at (%0) and leaves packed B in mm2, R in mm5,
 * G in mm4, with mm7 zeroed for the subsequent unpack/interleave. */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"\
    "pxor            %%mm7, %%mm7       \n\t"
#if 0 /* disabled: unused full-range YUV->RGB path, kept for reference */
#define FULL_YSCALEYUV2RGB \
    "pxor                 %%mm7, %%mm7  \n\t"\
    "movd                    %6, %%mm6  \n\t" /*yalpha1*/\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "punpcklwd            %%mm6, %%mm6  \n\t"\
    "movd                    %7, %%mm5  \n\t" /*uvalpha1*/\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "punpcklwd            %%mm5, %%mm5  \n\t"\
    "xor              %%"REG_a", %%"REG_a"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%0, %%"REG_a",2), %%mm0  \n\t" /*buf0[eax]*/\
    "movq     (%1, %%"REG_a",2), %%mm1  \n\t" /*buf1[eax]*/\
    "movq     (%2, %%"REG_a",2), %%mm2  \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, %%"REG_a",2), %%mm3  \n\t" /* uvbuf1[eax]*/\
    "psubw                %%mm1, %%mm0  \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw                %%mm3, %%mm2  \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "pmulhw               %%mm6, %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw               %%mm5, %%mm2  \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "psraw                   $4, %%mm1  \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "movq 4096(%2, %%"REG_a",2), %%mm4  \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                   $4, %%mm3  \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "paddw                %%mm0, %%mm1  \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "movq 4096(%3, %%"REG_a",2), %%mm0  \n\t" /* uvbuf1[eax+2048]*/\
    "paddw                %%mm2, %%mm3  \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "psubw                %%mm0, %%mm4  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "psubw        "MANGLE(w80)", %%mm1  \n\t" /* 8(Y-16)*/\
    "psubw       "MANGLE(w400)", %%mm3  \n\t" /* 8(U-128)*/\
    "pmulhw    "MANGLE(yCoeff)", %%mm1  \n\t"\
\
\
    "pmulhw               %%mm5, %%mm4  \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "movq                 %%mm3, %%mm2  \n\t" /* (U-128)8*/\
    "pmulhw   "MANGLE(ubCoeff)", %%mm3  \n\t"\
    "psraw                   $4, %%mm0  \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "pmulhw   "MANGLE(ugCoeff)", %%mm2  \n\t"\
    "paddw                %%mm4, %%mm0  \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw       "MANGLE(w400)", %%mm0  \n\t" /* (V-128)8*/\
\
\
    "movq                 %%mm0, %%mm4  \n\t" /* (V-128)8*/\
    "pmulhw   "MANGLE(vrCoeff)", %%mm0  \n\t"\
    "pmulhw   "MANGLE(vgCoeff)", %%mm4  \n\t"\
    "paddw                %%mm1, %%mm3  \n\t" /* B*/\
    "paddw                %%mm1, %%mm0  \n\t" /* R*/\
    "packuswb             %%mm3, %%mm3  \n\t"\
\
    "packuswb             %%mm0, %%mm0  \n\t"\
    "paddw                %%mm4, %%mm2  \n\t"\
    "paddw                %%mm2, %%mm1  \n\t" /* G*/\
\
    "packuswb             %%mm1, %%mm1  \n\t"
#endif
/* Two-line vertical interpolation for packed YUV output: blends buf0/buf1
 * (luma) and uvbuf0/uvbuf1 (chroma) with the alpha coefficients stored in the
 * context's MMX filter slots (pre-shifted >>3 by the first six instructions).
 * Leaves U/V in mm3/mm4 and Y1/Y2 in mm1/mm7. */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $7, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/

/* Indirection so that macro arguments are expanded before stringization. */
#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)
/* Two-line vertical interpolation followed by the YUV->RGB core: blends
 * buf0/buf1 and uvbuf0/uvbuf1 with the context's alpha coefficients, then
 * applies the offset/coefficient tables at ("#c"). Leaves packed B in mm2,
 * R in mm5, G in mm4, mm7 zeroed for the writer. */
#define REAL_YSCALEYUV2RGB(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[eax]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[eax]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
/* Indirection so that macro arguments are expanded before stringization. */
#define YSCALEYUV2RGB(index, c)  REAL_YSCALEYUV2RGB(index, c)
/* Single-line (no vertical blend) packed-YUV load: buf0/uvbuf0 only,
 * >>7 to output range. Leaves U/V in mm3/mm4 and Y1/Y2 in mm1/mm7. */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq 4096(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"

/* Indirection so that macro arguments are expanded before stringization. */
#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
/* Single-line (no vertical blend) YUV->RGB: buf0/uvbuf0 only, then the same
 * conversion core as REAL_YSCALEYUV2RGB. Leaves packed B in mm2, R in mm5,
 * G in mm4, mm7 zeroed for the writer. */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "movq 4096(%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
/* Indirection so that macro arguments are expanded before stringization. */
#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)
593
/* Single-luma-line variant with vertical chroma interpolation, for packed-YUV
 * output: averages the two chroma lines (uvbuf0/uvbuf1 in %2/%3, V plane at
 * byte offset 4096) and scales luma (buf0 in %0) down to 8 significant bits.
 * No YUV->RGB matrix math is done here; results feed WRITEYUY2.
 * 'c' (context pointer) is not referenced in this variant. */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
610

    
611
// do vertical chrominance interpolation
/* Single-luma-line YUV->RGB conversion that vertically averages the two
 * chroma input lines (uvbuf0/uvbuf1 in %2/%3, V plane at byte offset 4096).
 * Applies the per-context colour matrix (U/V/Y offsets and coefficients
 * stored at fixed offsets from the context pointer "c"), then packs the
 * results to unsigned bytes:
 *   on exit mm2=B, mm4=G, mm5=R (8 pixels each), mm7=0,
 * ready for the WRITEBGR*/ /* writer macros. */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ASMALIGN(4)\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "movq 4096(%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq 4096(%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[eax]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"\
    "pxor              %%mm7, %%mm7     \n\t"
#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)
664

    
665
/* Interleave the packed B/G/R bytes (mm2/mm4/mm5, mm7 must be 0) into eight
 * 4-byte pixels and store 32 bytes at dst + index*4 via MOVNTQ.
 * Advances index by 8 and jumps back to label "1" while index < dstw. */
#define REAL_WRITEBGR32(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    MOVNTQ(%%mm0,   (dst, index, 4))\
    MOVNTQ(%%mm2,  8(dst, index, 4))\
    MOVNTQ(%%mm1, 16(dst, index, 4))\
    MOVNTQ(%%mm3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp "#dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index)  REAL_WRITEBGR32(dst, dstw, index)
689

    
690
/* Pack B/G/R bytes (mm2/mm4/mm5, mm7=0) into 16bpp 5-6-5 pixels:
 * masks each channel to its significant bits (bF8/bFC = 0xF8/0xFC byte
 * masks), shifts them into place, and stores 16 bytes at dst + index*2.
 * Advances index by 8 and loops to label "1" while index < dstw. */
#define REAL_WRITEBGR16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITEBGR16(dst, dstw, index)  REAL_WRITEBGR16(dst, dstw, index)
717

    
718
/* Pack B/G/R bytes (mm2/mm4/mm5, mm7=0) into 15bpp 5-5-5 pixels:
 * same structure as WRITEBGR16 but all three channels keep 5 bits
 * (0xF8 mask everywhere, R shifted one extra bit right, G shifted by 2).
 * Stores 16 bytes at dst + index*2; advances index by 8, loops while
 * index < dstw. */
#define REAL_WRITEBGR15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp        "#dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITEBGR15(dst, dstw, index)  REAL_WRITEBGR15(dst, dstw, index)
746

    
747
/* Legacy 24bpp packer using shift/mask sequences (bm* byte-mask constants)
 * to squeeze eight 0RGB dwords into 24 output bytes.  Not referenced by the
 * WRITEBGR24 selector below (which picks the MMX/MMX2 variants); kept for
 * reference.  Stores 24 bytes at dst, then advances dst by 24 and index by 8,
 * looping to label "1" while index < dstw. */
#define WRITEBGR24OLD(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1             \n\t" /* B */\
    "movq      %%mm5, %%mm6             \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2             \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5             \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1             \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6             \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0             \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3             \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0             \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1             \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3             \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4             \n\t" /* 0RGB0RGB 0 */\
    "psrlq        $8, %%mm0             \n\t" /* 00RGB0RG 0 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 0 */\
    "pand "MANGLE(bm11111000)", %%mm0   \n\t" /* 00RGB000 0.5 */\
    "por       %%mm4, %%mm0             \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm4             \n\t" /* 0RGB0RGB 1 */\
    "psllq       $48, %%mm2             \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0             \n\t" /* GBRGBRGB 0 */\
\
    "movq      %%mm4, %%mm2             \n\t" /* 0RGB0RGB 1 */\
    "psrld       $16, %%mm4             \n\t" /* 000R000R 1 */\
    "psrlq       $24, %%mm2             \n\t" /* 0000RGB0 1.5 */\
    "por       %%mm4, %%mm2             \n\t" /* 000RRGBR 1 */\
    "pand "MANGLE(bm00001111)", %%mm2   \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm4             \n\t" /* 0RGB0RGB 2 */\
    "psrlq        $8, %%mm1             \n\t" /* 00RGB0RG 2 */\
    "pand "MANGLE(bm00000111)", %%mm4   \n\t" /* 00000RGB 2 */\
    "pand "MANGLE(bm11111000)", %%mm1   \n\t" /* 00RGB000 2.5 */\
    "por       %%mm4, %%mm1             \n\t" /* 00RGBRGB 2 */\
    "movq      %%mm1, %%mm4             \n\t" /* 00RGBRGB 2 */\
    "psllq       $32, %%mm1             \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm2             \n\t" /* BRGBRGBR 1 */\
\
    "psrlq       $32, %%mm4             \n\t" /* 000000RG 2.5 */\
    "movq      %%mm3, %%mm5             \n\t" /* 0RGB0RGB 3 */\
    "psrlq        $8, %%mm3             \n\t" /* 00RGB0RG 3 */\
    "pand "MANGLE(bm00000111)", %%mm5   \n\t" /* 00000RGB 3 */\
    "pand "MANGLE(bm11111000)", %%mm3   \n\t" /* 00RGB000 3.5 */\
    "por       %%mm5, %%mm3             \n\t" /* 00RGBRGB 3 */\
    "psllq       $16, %%mm3             \n\t" /* RGBRGB00 3 */\
    "por       %%mm4, %%mm3             \n\t" /* RGBRGBRG 2.5 */\
\
    MOVNTQ(%%mm0,   (dst))\
    MOVNTQ(%%mm2,  8(dst))\
    MOVNTQ(%%mm3, 16(dst))\
    "add         $24, "#dst"            \n\t"\
\
    "add          $8, "#index"          \n\t"\
    "cmp     "#dstw", "#index"          \n\t"\
    " jb          1b                    \n\t"
802

    
803
/* Pure-MMX 24bpp packer: expands B/G/R bytes (mm2/mm4/mm5, mm7=0) into
 * 0RGBRGB0 qwords, then shifts/ORs them together into three contiguous
 * 8-byte stores (24 output bytes).  Advances dst by 24 and index by 8,
 * looping to label "1" while index < dstw.  Clobbers mm7 (used as scratch
 * after the initial interleave). */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
855

    
856
/* MMX2 24bpp packer: uses pshufw (MMX2-only) plus the ff_M24A/B/C mask
 * constants to assemble the three 8-byte output groups with far fewer
 * shift/OR steps than the pure-MMX version.  Same contract: input B/G/R in
 * mm2/mm4/mm5 (mm7=0 is overwritten with ff_M24C), stores 24 bytes at dst,
 * advances dst by 24 and index by 8, loops to label "1" while index < dstw. */
#define WRITEBGR24MMX2(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp  "#dstw", "#index"     \n\t"\
    " jb       1b               \n\t"
903

    
904
/* Select the BGR24 writer: the MMX2 variant relies on pshufw, which plain
 * MMX lacks, so fall back to the pure-MMX variant otherwise. */
#ifdef HAVE_MMX2
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX2(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif
911

    
912
/* Interleave luma (mm1/mm7) with chroma (mm3/mm4) into packed YUYV and
 * store 16 bytes at dst + index*2 via MOVNTQ.  Register meanings follow
 * the YSCALEYUV2PACKED* producer macros.  Advances index by 8 and loops
 * to label "1" while index < dstw. */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp     "#dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)
928

    
929

    
930
/**
 * Vertical multi-tap filtering to planar YV12 output.
 * Applies lumFilter/chrFilter across the lumSrc/chrSrc line arrays and
 * writes 8-bit planes to dest (Y), uDest (U) and vDest (V).
 * If uDest is NULL, chroma output is skipped entirely.
 * With MMX, the YSCALEYUV2YV12X* asm macros do the work (the 4096-byte
 * offset selects the V half of the chroma buffer); SWS_ACCURATE_RND picks
 * the higher-precision rounding variant.  Otherwise falls back to
 * AltiVec or plain C implementations.
 */
static inline void RENAME(yuv2yuvX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                    int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (c->flags & SWS_ACCURATE_RND){
        if (uDest){
            YSCALEYUV2YV12X_ACCURATE(   0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X_ACCURATE(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X_ACCURATE(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
    }else{
        if (uDest){
            YSCALEYUV2YV12X(   0, CHR_MMX_FILTER_OFFSET, uDest, chrDstW)
            YSCALEYUV2YV12X(4096, CHR_MMX_FILTER_OFFSET, vDest, chrDstW)
        }

        YSCALEYUV2YV12X(0, LUM_MMX_FILTER_OFFSET, dest, dstW)
    }
#else
#ifdef HAVE_ALTIVEC
yuv2yuvX_altivec_real(lumFilter, lumSrc, lumFilterSize,
                      chrFilter, chrSrc, chrFilterSize,
                      dest, uDest, vDest, dstW, chrDstW);
#else //HAVE_ALTIVEC
yuv2yuvXinC(lumFilter, lumSrc, lumFilterSize,
            chrFilter, chrSrc, chrFilterSize,
            dest, uDest, vDest, dstW, chrDstW);
#endif //!HAVE_ALTIVEC
#endif /* HAVE_MMX */
}
962

    
963
/**
 * Vertical multi-tap filtering to NV12/NV21 (interleaved-chroma) output.
 * Simply forwards every argument except the unused context pointer to the
 * generic C implementation yuv2nv12XinC().
 */
static inline void RENAME(yuv2nv12X)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                     int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                     uint8_t *dest, uint8_t *uDest, int dstW, int chrDstW, int dstFormat)
{
    yuv2nv12XinC(lumFilter, lumSrc, lumFilterSize, chrFilter, chrSrc, chrFilterSize,
                 dest, uDest, dstW, chrDstW, dstFormat);
}
971

    
972
/**
 * Unfiltered (1-tap) vertical output to planar YV12: each 15-bit-ish
 * intermediate sample (see the >>7 below) is rounded down to 8 bits and
 * clipped to [0,255].  chrSrc holds U at [0..] and V at [2048..];
 * uDest==NULL skips chroma (vDest is then never touched).
 * The MMX path runs the YSCALEYUV2YV121 asm loop once per plane, passing
 * end-of-buffer pointers plus a negative count so the loop index can
 * count up to zero.
 */
static inline void RENAME(yuv2yuv1)(int16_t *lumSrc, int16_t *chrSrc,
                                    uint8_t *dest, uint8_t *uDest, uint8_t *vDest, long dstW, long chrDstW)
{
#ifdef HAVE_MMX
    if (uDest)
    {
        /* U plane */
        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + chrDstW), "r" (uDest + chrDstW),
            "g" (-chrDstW)
            : "%"REG_a
        );

        /* V plane (second half of chrSrc) */
        asm volatile(
            YSCALEYUV2YV121
            :: "r" (chrSrc + 2048 + chrDstW), "r" (vDest + chrDstW),
            "g" (-chrDstW)
            : "%"REG_a
        );
    }

    /* Y plane */
    asm volatile(
        YSCALEYUV2YV121
        :: "r" (lumSrc + dstW), "r" (dest + dstW),
        "g" (-dstW)
        : "%"REG_a
    );
#else
    int i;
    for (i=0; i<dstW; i++)
    {
        int val= lumSrc[i]>>7;

        /* bit 8 set means out of [0,255]: negative values (sign-extended)
           or >255, so clamp to the matching end of the range */
        if (val&256){
            if (val<0) val=0;
            else       val=255;
        }

        dest[i]= val;
    }

    if (uDest)
        for (i=0; i<chrDstW; i++)
        {
            int u=chrSrc[i]>>7;
            int v=chrSrc[i + 2048]>>7;

            /* same clamp, but test both samples at once */
            if ((u|v)&256){
                if (u<0)        u=0;
                else if (u>255) u=255;
                if (v<0)        v=0;
                else if (v>255) v=255;
            }

            uDest[i]= u;
            vDest[i]= v;
        }
#endif
}
1031

    
1032

    
1033
/**
 * vertical scale YV12 to RGB
 *
 * Applies the vertical filters and converts to a packed output format.
 * With MMX, dispatches per c->dstFormat to an asm body built from the
 * YSCALEYUV2PACKEDX* / YSCALEYUV2RGBX producer macros and a WRITE* writer;
 * the BGR24 cases spell out the asm tail (operands/clobbers) by hand
 * because they need an extra register for the *3 destination stride.
 * Unhandled formats fall through to the AltiVec or plain C versions.
 */
static inline void RENAME(yuv2packedX)(SwsContext *c, int16_t *lumFilter, int16_t **lumSrc, int lumFilterSize,
                                       int16_t *chrFilter, int16_t **chrSrc, int chrFilterSize,
                                       uint8_t *dest, long dstW, long dstY)
{
#ifdef HAVE_MMX
    long dummy=0;
    if (c->flags & SWS_ACCURATE_RND){
        switch(c->dstFormat){
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)

            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
            "add %4, %%"REG_c"                        \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)


            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest), "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_BGR555:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITEBGR15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR565:
            YSCALEYUV2PACKEDX_ACCURATE
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2\n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4\n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5\n\t"
#endif

            WRITEBGR16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX_ACCURATE
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3    \n\t"
            "psraw $3, %%mm4    \n\t"
            "psraw $3, %%mm1    \n\t"
            "psraw $3, %%mm7    \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
    }
    }else{
        switch(c->dstFormat)
        {
        case PIX_FMT_RGB32:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            WRITEBGR32(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR24:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
            "add                        %4, %%"REG_c"   \n\t"
            WRITEBGR24(%%REGc, %5, %%REGa)

            :: "r" (&c->redDither),
               "m" (dummy), "m" (dummy), "m" (dummy),
               "r" (dest),  "m" (dstW)
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
            );
            return;
        case PIX_FMT_BGR555:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4  \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
#endif

            WRITEBGR15(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_BGR565:
            YSCALEYUV2PACKEDX
            YSCALEYUV2RGBX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2  \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4  \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5  \n\t"
#endif

            WRITEBGR16(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        case PIX_FMT_YUYV422:
            YSCALEYUV2PACKEDX
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */

            "psraw $3, %%mm3    \n\t"
            "psraw $3, %%mm4    \n\t"
            "psraw $3, %%mm1    \n\t"
            "psraw $3, %%mm7    \n\t"
            WRITEYUY2(%4, %5, %%REGa)
            YSCALEYUV2PACKEDX_END
            return;
        }
    }
#endif /* HAVE_MMX */
#ifdef HAVE_ALTIVEC
    /* The following list of supported dstFormat values should
       match what's found in the body of altivec_yuv2packedX() */
    if (c->dstFormat==PIX_FMT_ABGR  || c->dstFormat==PIX_FMT_BGRA  ||
        c->dstFormat==PIX_FMT_BGR24 || c->dstFormat==PIX_FMT_RGB24 ||
        c->dstFormat==PIX_FMT_RGBA  || c->dstFormat==PIX_FMT_ARGB)
            altivec_yuv2packedX (c, lumFilter, lumSrc, lumFilterSize,
                                 chrFilter, chrSrc, chrFilterSize,
                                 dest, dstW, dstY);
    else
#endif
        yuv2packedXinC(c, lumFilter, lumSrc, lumFilterSize,
                       chrFilter, chrSrc, chrFilterSize,
                       dest, dstW, dstY);
}
1180

    
1181
/**
1182
 * vertical bilinear scale YV12 to RGB
1183
 */
1184
static inline void RENAME(yuv2packed2)(SwsContext *c, uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, uint16_t *uvbuf1,
1185
                          uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
1186
{
1187
    int yalpha1=yalpha^4095;
1188
    int uvalpha1=uvalpha^4095;
1189
    int i;
1190

    
1191
#if 0 //isn't used
1192
    if (flags&SWS_FULL_CHR_H_INT)
1193
    {
1194
        switch(dstFormat)
1195
        {
1196
#ifdef HAVE_MMX
1197
        case PIX_FMT_RGB32:
1198
            asm volatile(
1199

1200

1201
FULL_YSCALEYUV2RGB
1202
            "punpcklbw %%mm1, %%mm3    \n\t" // BGBGBGBG
1203
            "punpcklbw %%mm7, %%mm0    \n\t" // R0R0R0R0
1204

1205
            "movq      %%mm3, %%mm1    \n\t"
1206
            "punpcklwd %%mm0, %%mm3    \n\t" // BGR0BGR0
1207
            "punpckhwd %%mm0, %%mm1    \n\t" // BGR0BGR0
1208

1209
            MOVNTQ(%%mm3,  (%4, %%REGa, 4))
1210
            MOVNTQ(%%mm1, 8(%4, %%REGa, 4))
1211

1212
            "add $4, %%"REG_a"  \n\t"
1213
            "cmp %5, %%"REG_a"  \n\t"
1214
            " jb 1b             \n\t"
1215

1216
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" ((long)dstW),
1217
            "m" (yalpha1), "m" (uvalpha1)
1218
            : "%"REG_a
1219
            );
1220
            break;
1221
        case PIX_FMT_BGR24:
1222
            asm volatile(
1223

1224
FULL_YSCALEYUV2RGB
1225

1226
                                              // lsb ... msb
1227
            "punpcklbw %%mm1, %%mm3     \n\t" // BGBGBGBG
1228
            "punpcklbw %%mm7, %%mm0     \n\t" // R0R0R0R0
1229

1230
            "movq      %%mm3, %%mm1     \n\t"
1231
            "punpcklwd %%mm0, %%mm3     \n\t" // BGR0BGR0
1232
            "punpckhwd %%mm0, %%mm1     \n\t" // BGR0BGR0
1233

1234
            "movq      %%mm3, %%mm2     \n\t" // BGR0BGR0
1235
            "psrlq        $8, %%mm3     \n\t" // GR0BGR00
1236
            "pand "MANGLE(bm00000111)", %%mm2   \n\t" // BGR00000
1237
            "pand "MANGLE(bm11111000)", %%mm3   \n\t" // 000BGR00
1238
            "por       %%mm2, %%mm3     \n\t" // BGRBGR00
1239
            "movq      %%mm1, %%mm2     \n\t"
1240
            "psllq       $48, %%mm1     \n\t" // 000000BG
1241
            "por       %%mm1, %%mm3     \n\t" // BGRBGRBG
1242

1243
            "movq      %%mm2, %%mm1     \n\t" // BGR0BGR0
1244
            "psrld       $16, %%mm2     \n\t" // R000R000
1245
            "psrlq       $24, %%mm1     \n\t" // 0BGR0000
1246
            "por       %%mm2, %%mm1     \n\t" // RBGRR000
1247

1248
            "mov          %4, %%"REG_b" \n\t"
1249
            "add   %%"REG_a", %%"REG_b" \n\t"
1250

1251
#ifdef HAVE_MMX2
1252
            //FIXME Alignment
1253
            "movntq %%mm3,  (%%"REG_b", %%"REG_a", 2)   \n\t"
1254
            "movntq %%mm1, 8(%%"REG_b", %%"REG_a", 2)   \n\t"
1255
#else
1256
            "movd %%mm3,  (%%"REG_b", %%"REG_a", 2)     \n\t"
1257
            "psrlq  $32, %%mm3                          \n\t"
1258
            "movd %%mm3, 4(%%"REG_b", %%"REG_a", 2)     \n\t"
1259
            "movd %%mm1, 8(%%"REG_b", %%"REG_a", 2)     \n\t"
1260
#endif
1261
            "add     $4, %%"REG_a"                      \n\t"
1262
            "cmp     %5, %%"REG_a"                      \n\t"
1263
            " jb     1b                                 \n\t"
1264

    
1265
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "m" (dest), "m" (dstW),
1266
            "m" (yalpha1), "m" (uvalpha1)
1267
            : "%"REG_a, "%"REG_b
1268
            );
1269
            break;
1270
        case PIX_FMT_BGR555:
1271
            asm volatile(
1272

    
1273
FULL_YSCALEYUV2RGB
1274
#ifdef DITHER1XBPP
1275
            "paddusb "MANGLE(g5Dither)", %%mm1  \n\t"
1276
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1277
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1278
#endif
1279
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1280
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1281
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1282

    
1283
            "psrlw                   $3, %%mm3  \n\t"
1284
            "psllw                   $2, %%mm1  \n\t"
1285
            "psllw                   $7, %%mm0  \n\t"
1286
            "pand     "MANGLE(g15Mask)", %%mm1  \n\t"
1287
            "pand     "MANGLE(r15Mask)", %%mm0  \n\t"
1288

    
1289
            "por                  %%mm3, %%mm1  \n\t"
1290
            "por                  %%mm1, %%mm0  \n\t"
1291

    
1292
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1293

    
1294
            "add $4, %%"REG_a"  \n\t"
1295
            "cmp %5, %%"REG_a"  \n\t"
1296
            " jb 1b             \n\t"
1297

    
1298
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1299
            "m" (yalpha1), "m" (uvalpha1)
1300
            : "%"REG_a
1301
            );
1302
            break;
1303
        case PIX_FMT_BGR565:
1304
            asm volatile(
1305

    
1306
FULL_YSCALEYUV2RGB
1307
#ifdef DITHER1XBPP
1308
            "paddusb "MANGLE(g6Dither)", %%mm1  \n\t"
1309
            "paddusb "MANGLE(r5Dither)", %%mm0  \n\t"
1310
            "paddusb "MANGLE(b5Dither)", %%mm3  \n\t"
1311
#endif
1312
            "punpcklbw            %%mm7, %%mm1  \n\t" // 0G0G0G0G
1313
            "punpcklbw            %%mm7, %%mm3  \n\t" // 0B0B0B0B
1314
            "punpcklbw            %%mm7, %%mm0  \n\t" // 0R0R0R0R
1315

    
1316
            "psrlw                   $3, %%mm3  \n\t"
1317
            "psllw                   $3, %%mm1  \n\t"
1318
            "psllw                   $8, %%mm0  \n\t"
1319
            "pand     "MANGLE(g16Mask)", %%mm1  \n\t"
1320
            "pand     "MANGLE(r16Mask)", %%mm0  \n\t"
1321

    
1322
            "por                  %%mm3, %%mm1  \n\t"
1323
            "por                  %%mm1, %%mm0  \n\t"
1324

    
1325
            MOVNTQ(%%mm0, (%4, %%REGa, 2))
1326

    
1327
            "add $4, %%"REG_a"  \n\t"
1328
            "cmp %5, %%"REG_a"  \n\t"
1329
            " jb 1b             \n\t"
1330

    
1331
            :: "r" (buf0), "r" (buf1), "r" (uvbuf0), "r" (uvbuf1), "r" (dest), "m" (dstW),
1332
            "m" (yalpha1), "m" (uvalpha1)
1333
            : "%"REG_a
1334
            );
1335
            break;
1336
#endif /* HAVE_MMX */
1337
        case PIX_FMT_BGR32:
1338
#ifndef HAVE_MMX
1339
        case PIX_FMT_RGB32:
1340
#endif
1341
            if (dstFormat==PIX_FMT_RGB32)
1342
            {
1343
                int i;
1344
#ifdef WORDS_BIGENDIAN
1345
                dest++;
1346
#endif
1347
                for (i=0;i<dstW;i++){
1348
                    // vertical linear interpolation && yuv2rgb in a single step:
1349
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1350
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1351
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1352
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1353
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1354
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1355
                    dest+= 4;
1356
                }
1357
            }
1358
            else if (dstFormat==PIX_FMT_BGR24)
1359
            {
1360
                int i;
1361
                for (i=0;i<dstW;i++){
1362
                    // vertical linear interpolation && yuv2rgb in a single step:
1363
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1364
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1365
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1366
                    dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
1367
                    dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
1368
                    dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
1369
                    dest+= 3;
1370
                }
1371
            }
1372
            else if (dstFormat==PIX_FMT_BGR565)
1373
            {
1374
                int i;
1375
                for (i=0;i<dstW;i++){
1376
                    // vertical linear interpolation && yuv2rgb in a single step:
1377
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1378
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1379
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1380

    
1381
                    ((uint16_t*)dest)[i] =
1382
                        clip_table16b[(Y + yuvtab_40cf[U]) >>13] |
1383
                        clip_table16g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1384
                        clip_table16r[(Y + yuvtab_3343[V]) >>13];
1385
                }
1386
            }
1387
            else if (dstFormat==PIX_FMT_BGR555)
1388
            {
1389
                int i;
1390
                for (i=0;i<dstW;i++){
1391
                    // vertical linear interpolation && yuv2rgb in a single step:
1392
                    int Y=yuvtab_2568[((buf0[i]*yalpha1+buf1[i]*yalpha)>>19)];
1393
                    int U=((uvbuf0[i]*uvalpha1+uvbuf1[i]*uvalpha)>>19);
1394
                    int V=((uvbuf0[i+2048]*uvalpha1+uvbuf1[i+2048]*uvalpha)>>19);
1395

    
1396
                    ((uint16_t*)dest)[i] =
1397
                        clip_table15b[(Y + yuvtab_40cf[U]) >>13] |
1398
                        clip_table15g[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13] |
1399
                        clip_table15r[(Y + yuvtab_3343[V]) >>13];
1400
                }
1401
            }
1402
        }//FULL_UV_IPOL
1403
    else
1404
    {
1405
#endif // if 0
1406
#ifdef HAVE_MMX
1407
        switch(c->dstFormat)
1408
        {
1409
            //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
1410
            case PIX_FMT_RGB32:
1411
                asm volatile(
1412
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1413
                "mov        %4, %%"REG_b"               \n\t"
1414
                "push %%"REG_BP"                        \n\t"
1415
                YSCALEYUV2RGB(%%REGBP, %5)
1416
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
1417
                "pop %%"REG_BP"                         \n\t"
1418
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1419

    
1420
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1421
                "a" (&c->redDither)
1422
                );
1423
                return;
1424
            case PIX_FMT_BGR24:
1425
                asm volatile(
1426
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1427
                "mov        %4, %%"REG_b"               \n\t"
1428
                "push %%"REG_BP"                        \n\t"
1429
                YSCALEYUV2RGB(%%REGBP, %5)
1430
                WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
1431
                "pop %%"REG_BP"                         \n\t"
1432
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1433
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1434
                "a" (&c->redDither)
1435
                );
1436
                return;
1437
            case PIX_FMT_BGR555:
1438
                asm volatile(
1439
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1440
                "mov        %4, %%"REG_b"               \n\t"
1441
                "push %%"REG_BP"                        \n\t"
1442
                YSCALEYUV2RGB(%%REGBP, %5)
1443
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1444
#ifdef DITHER1XBPP
1445
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1446
                "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
1447
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1448
#endif
1449

    
1450
                WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
1451
                "pop %%"REG_BP"                         \n\t"
1452
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1453

    
1454
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1455
                "a" (&c->redDither)
1456
                );
1457
                return;
1458
            case PIX_FMT_BGR565:
1459
                asm volatile(
1460
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1461
                "mov        %4, %%"REG_b"               \n\t"
1462
                "push %%"REG_BP"                        \n\t"
1463
                YSCALEYUV2RGB(%%REGBP, %5)
1464
                /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1465
#ifdef DITHER1XBPP
1466
                "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
1467
                "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
1468
                "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
1469
#endif
1470

    
1471
                WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
1472
                "pop %%"REG_BP"                         \n\t"
1473
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1474
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1475
                "a" (&c->redDither)
1476
                );
1477
                return;
1478
            case PIX_FMT_YUYV422:
1479
                asm volatile(
1480
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1481
                "mov %4, %%"REG_b"                        \n\t"
1482
                "push %%"REG_BP"                        \n\t"
1483
                YSCALEYUV2PACKED(%%REGBP, %5)
1484
                WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
1485
                "pop %%"REG_BP"                         \n\t"
1486
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1487
                :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
1488
                "a" (&c->redDither)
1489
                );
1490
                return;
1491
            default: break;
1492
        }
1493
#endif //HAVE_MMX
1494
YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB2_C, YSCALE_YUV_2_PACKED2_C)
1495
}
1496

    
1497
/**
 * YV12 to RGB without scaling or interpolating.
 *
 * Converts one line: buf0 holds the luma samples, uvbuf0/uvbuf1 the two
 * candidate chroma lines (V stored at offset 2048 past U in the C paths of
 * this file). uvalpha selects/blends between the chroma lines; dstFormat
 * picks the packed output layout written to dest (dstW pixels wide).
 */
static inline void RENAME(yuv2packed1)(SwsContext *c, uint16_t *buf0, uint16_t *uvbuf0, uint16_t *uvbuf1,
                          uint8_t *dest, int dstW, int uvalpha, int dstFormat, int flags, int y)
{
    const int yalpha1=0;
    int i;  // loop index used by the YSCALE_YUV_2_ANYRGB_C fallback below

    uint16_t *buf1= buf0; //FIXME needed for the rgb1/bgr1
    const int yalpha= 4096; //FIXME ...

    // Full horizontal chroma interpolation requested: defer to the
    // two-line blender with a zero luma blend factor.
    if (flags&SWS_FULL_CHR_H_INT)
    {
        RENAME(yuv2packed2)(c, buf0, buf0, uvbuf0, uvbuf1, dest, dstW, 0, uvalpha, y);
        return;
    }

#ifdef HAVE_MMX
    // Fast path A: chroma phase is close to uvbuf0, so use only that line
    // (YSCALEYUV2RGB1/PACKED1 macros). Each asm block manually saves and
    // restores REG_b and REG_BP because the macros use them as pointers.
    if (uvalpha < 2048) // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
    {
        switch(dstFormat)
        {
        case PIX_FMT_RGB32:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR555:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            // ordered dither before truncating to 5/5/5 bits per channel
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif
            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR565:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            // ordered dither before truncating to 5/6/5 bits per channel
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif

            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        }
    }
    else
    {
        // Fast path B: chroma phase is between the two lines; the "1b"
        // macro variants average uvbuf0 and uvbuf1.
        switch(dstFormat)
        {
        case PIX_FMT_RGB32:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR24:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR555:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g5Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif
            WRITEBGR15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_BGR565:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "MANGLE(b5Dither)", %%mm2      \n\t"
            "paddusb "MANGLE(g6Dither)", %%mm4      \n\t"
            "paddusb "MANGLE(r5Dither)", %%mm5      \n\t"
#endif

            WRITEBGR16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        case PIX_FMT_YUYV422:
            asm volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"

            :: "c" (buf0), "d" (buf1), "S" (uvbuf0), "D" (uvbuf1), "m" (dest),
            "a" (&c->redDither)
            );
            return;
        }
    }
#endif /* HAVE_MMX */
    // Portable C fallback, mirroring the two chroma cases above.
    if (uvalpha < 2048)
    {
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1_C, YSCALE_YUV_2_PACKED1_C)
    }else{
        YSCALE_YUV_2_ANYRGB_C(YSCALE_YUV_2_RGB1B_C, YSCALE_YUV_2_PACKED1B_C)
    }
}
1702

    
1703
//FIXME yuy2* may read up to 7 samples too many
1704

    
1705
/**
 * Extract the luma bytes (even bytes: dst[i] = src[2*i]) from a
 * YUY2/YUYV line of 'width' luma samples into dst.
 */
static inline void RENAME(yuy2ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    // Negative-index loop: REG_a runs from -width up to 0, with src/dst
    // pointers pre-advanced past the end, so "js 1b" terminates at zero.
    asm volatile(
    "movq "MANGLE(bm01010101)", %%mm2           \n\t" // byte mask 01 01 01 01 ...
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",2), %%mm0           \n\t" // 8 interleaved YUYV bytes
    "movq   8(%1, %%"REG_a",2), %%mm1           \n\t" // next 8 bytes
    "pand                %%mm2, %%mm0           \n\t" // keep even (Y) bytes
    "pand                %%mm2, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t" // pack 8 Y words -> 8 bytes
    "movq                %%mm0, (%2, %%"REG_a") \n\t"
    "add                    $8, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i];
#endif
}
1729

    
1730
/**
 * De-interleave the chroma bytes of a YUY2/YUYV line:
 * dstU[i] = src1[4*i+1], dstV[i] = src1[4*i+3] for 'width' chroma samples.
 * src2 must equal src1 (asserted); the two-pointer signature matches the
 * other *ToUV readers in this file.
 */
static inline void RENAME(yuy2ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    // Negative-index loop as in yuy2ToY; processes 4 chroma pairs per pass.
    asm volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t" // byte mask 01 01 01 01 ...
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t" // drop Y, keep U/V bytes
    "psrlw                  $8, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t" // mm0 = V U V U V U V U
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t" // isolate V
    "pand                %%mm4, %%mm1           \n\t" // isolate U
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t" // 4 V bytes
    "movd                %%mm1, (%2, %%"REG_a") \n\t" // 4 U bytes
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 1];
        dstV[i]= src1[4*i + 3];
    }
#endif
    assert(src1 == src2);
}
1764

    
1765
//this is almost identical to the previous, and exists only because yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses
1766
/**
 * Extract the luma bytes (odd bytes: dst[i] = src[2*i+1]) from a
 * UYVY line of 'width' luma samples into dst.
 */
static inline void RENAME(uyvyToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    // Negative-index loop; shifting each word right by 8 keeps the odd
    // (Y) byte of every U/Y or V/Y pair.
    asm volatile(
    "mov                  %0, %%"REG_a"         \n\t"
    "1:                                         \n\t"
    "movq  (%1, %%"REG_a",2), %%mm0             \n\t"
    "movq 8(%1, %%"REG_a",2), %%mm1             \n\t"
    "psrlw                $8, %%mm0             \n\t"
    "psrlw                $8, %%mm1             \n\t"
    "packuswb          %%mm1, %%mm0             \n\t"
    "movq              %%mm0, (%2, %%"REG_a")   \n\t"
    "add                  $8, %%"REG_a"         \n\t"
    " js                  1b                    \n\t"
    : : "g" (-width), "r" (src+width*2), "r" (dst+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
        dst[i]= src[2*i+1];
#endif
}
1789

    
1790
/**
 * De-interleave the chroma bytes of a UYVY line:
 * dstU[i] = src1[4*i+0], dstV[i] = src1[4*i+2] for 'width' chroma samples.
 * src2 must equal src1 (asserted).
 */
static inline void RENAME(uyvyToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    // Same structure as yuy2ToUV, but chroma sits in the even bytes here,
    // so the first step masks (pand) instead of shifting.
    asm volatile(
    "movq "MANGLE(bm01010101)", %%mm4           \n\t" // byte mask 01 01 01 01 ...
    "mov                    %0, %%"REG_a"       \n\t"
    "1:                                         \n\t"
    "movq    (%1, %%"REG_a",4), %%mm0           \n\t"
    "movq   8(%1, %%"REG_a",4), %%mm1           \n\t"
    "pand                %%mm4, %%mm0           \n\t" // keep even (U/V) bytes
    "pand                %%mm4, %%mm1           \n\t"
    "packuswb            %%mm1, %%mm0           \n\t" // mm0 = V U V U V U V U
    "movq                %%mm0, %%mm1           \n\t"
    "psrlw                  $8, %%mm0           \n\t" // isolate V
    "pand                %%mm4, %%mm1           \n\t" // isolate U
    "packuswb            %%mm0, %%mm0           \n\t"
    "packuswb            %%mm1, %%mm1           \n\t"
    "movd                %%mm0, (%3, %%"REG_a") \n\t" // 4 V bytes
    "movd                %%mm1, (%2, %%"REG_a") \n\t" // 4 U bytes
    "add                    $4, %%"REG_a"       \n\t"
    " js                    1b                  \n\t"
    : : "g" (-width), "r" (src1+width*4), "r" (dstU+width), "r" (dstV+width)
    : "%"REG_a
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        dstU[i]= src1[4*i + 0];
        dstV[i]= src1[4*i + 2];
    }
#endif
    assert(src1 == src2);
}
1824

    
1825
static inline void RENAME(bgr32ToY)(uint8_t *dst, uint8_t *src, int width)
1826
{
1827
    int i;
1828
    for (i=0; i<width; i++)
1829
    {
1830
        int b=  ((uint32_t*)src)[i]&0xFF;
1831
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
1832
        int r= (((uint32_t*)src)[i]>>16)&0xFF;
1833

    
1834
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1835
    }
1836
}
1837

    
1838
static inline void RENAME(bgr32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
1839
{
1840
    int i;
1841
    assert(src1 == src2);
1842
    for (i=0; i<width; i++)
1843
    {
1844
        const int a= ((uint32_t*)src1)[2*i+0];
1845
        const int e= ((uint32_t*)src1)[2*i+1];
1846
        const int l= (a&0xFF00FF) + (e&0xFF00FF);
1847
        const int h= (a&0x00FF00) + (e&0x00FF00);
1848
        const int b=  l&0x3FF;
1849
        const int g=  h>>8;
1850
        const int r=  l>>16;
1851

    
1852
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
1853
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
1854
    }
1855
}
1856

    
1857
/**
 * Convert a line of packed 24-bit BGR pixels to luma.
 * C reference: dst[i] = (RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))
 *                        >> RGB2YUV_SHIFT  with b,g,r = src[3i+0..2].
 */
static inline void RENAME(bgr24ToY)(uint8_t *dst, uint8_t *src, long width)
{
#ifdef HAVE_MMX
    // Negative-index loop: REG_a counts -width..0 over dst, REG_d = 3*REG_a
    // indexes the 3-byte-per-pixel source. 8 pixels are produced per pass.
    asm volatile(
    "mov                        %2, %%"REG_a"   \n\t"
    "movq  "MANGLE(ff_bgr2YCoeff)", %%mm6       \n\t" // B,G,R -> Y coefficients
    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t" // word 1,1,1,1 for the horizontal add
    "pxor                    %%mm7, %%mm7       \n\t" // zero, for byte->word unpack
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t" // REG_d = 3 * REG_a
    ASMALIGN(4)
    "1:                                         \n\t"
    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
    // first 4 pixels: unpack bytes to words and apply the Y coefficients
    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
    "movd         3(%0, %%"REG_d"), %%mm1       \n\t"
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm1       \n\t"
    "movd         6(%0, %%"REG_d"), %%mm2       \n\t"
    "movd         9(%0, %%"REG_d"), %%mm3       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm3       \n\t"
    "pmaddwd                 %%mm6, %%mm0       \n\t"
    "pmaddwd                 %%mm6, %%mm1       \n\t"
    "pmaddwd                 %%mm6, %%mm2       \n\t"
    "pmaddwd                 %%mm6, %%mm3       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                      $8, %%mm0       \n\t"
    "psrad                      $8, %%mm1       \n\t"
    "psrad                      $8, %%mm2       \n\t"
    "psrad                      $8, %%mm3       \n\t"
#endif
    "packssdw                %%mm1, %%mm0       \n\t"
    "packssdw                %%mm3, %%mm2       \n\t"
    "pmaddwd                 %%mm5, %%mm0       \n\t" // horizontal add via *1 madd
    "pmaddwd                 %%mm5, %%mm2       \n\t"
    "packssdw                %%mm2, %%mm0       \n\t"
    "psraw                      $7, %%mm0       \n\t"

    // second 4 pixels, same pipeline into mm4
    "movd        12(%0, %%"REG_d"), %%mm4       \n\t"
    "movd        15(%0, %%"REG_d"), %%mm1       \n\t"
    "punpcklbw               %%mm7, %%mm4       \n\t"
    "punpcklbw               %%mm7, %%mm1       \n\t"
    "movd        18(%0, %%"REG_d"), %%mm2       \n\t"
    "movd        21(%0, %%"REG_d"), %%mm3       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm3       \n\t"
    "pmaddwd                 %%mm6, %%mm4       \n\t"
    "pmaddwd                 %%mm6, %%mm1       \n\t"
    "pmaddwd                 %%mm6, %%mm2       \n\t"
    "pmaddwd                 %%mm6, %%mm3       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                      $8, %%mm4       \n\t"
    "psrad                      $8, %%mm1       \n\t"
    "psrad                      $8, %%mm2       \n\t"
    "psrad                      $8, %%mm3       \n\t"
#endif
    "packssdw                %%mm1, %%mm4       \n\t"
    "packssdw                %%mm3, %%mm2       \n\t"
    "pmaddwd                 %%mm5, %%mm4       \n\t"
    "pmaddwd                 %%mm5, %%mm2       \n\t"
    "add                       $24, %%"REG_d"   \n\t" // advance source by 8 pixels
    "packssdw                %%mm2, %%mm4       \n\t"
    "psraw                      $7, %%mm4       \n\t"

    "packuswb                %%mm4, %%mm0       \n\t" // 8 Y values as bytes
    "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t" // add the luma offset

    "movq                    %%mm0, (%1, %%"REG_a") \n\t"
    "add                        $8, %%"REG_a"   \n\t"
    " js                        1b              \n\t"
    : : "r" (src+width*3), "r" (dst+width), "g" (-width)
    : "%"REG_a, "%"REG_d
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src[i*3+0];
        int g= src[i*3+1];
        int r= src[i*3+2];

        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
    }
#endif /* HAVE_MMX */
}
1941

    
1942
/**
 * Convert pairs of packed 24-bit BGR pixels to one chroma (U,V) sample
 * each, averaging the two pixels horizontally.
 * C reference: b,g,r = sums of the two pixels' channels;
 *   dstU[i] = ((RU*r + GU*g + BU*b) >> (RGB2YUV_SHIFT+1)) + 128
 *   dstV[i] = ((RV*r + GV*g + BV*b) >> (RGB2YUV_SHIFT+1)) + 128
 * src2 must equal src1 (asserted).
 */
static inline void RENAME(bgr24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, long width)
{
#ifdef HAVE_MMX
    // Negative-index loop: REG_a counts -width..0 over the chroma outputs,
    // REG_d = 6*REG_a indexes the source (2 BGR pixels per chroma sample).
    asm volatile(
    "mov                        %3, %%"REG_a"   \n\t"
    "movq       "MANGLE(ff_w1111)", %%mm5       \n\t" // word 1,1,1,1 for horizontal adds
    "movq  "MANGLE(ff_bgr2UCoeff)", %%mm6       \n\t" // B,G,R -> U coefficients
    "pxor                    %%mm7, %%mm7       \n\t" // zero, for byte->word unpack
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"   \n\t"
    "add                 %%"REG_d", %%"REG_d"   \n\t" // REG_d = 6 * REG_a
    ASMALIGN(4)
    "1:                                         \n\t"
    PREFETCH" 64(%0, %%"REG_d")                 \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    // average each pixel with its neighbour via pavgb on 24-bit-shifted copies
    "movq          (%0, %%"REG_d"), %%mm0       \n\t"
    "movq         6(%0, %%"REG_d"), %%mm2       \n\t"
    "movq                    %%mm0, %%mm1       \n\t"
    "movq                    %%mm2, %%mm3       \n\t"
    "psrlq                     $24, %%mm0       \n\t"
    "psrlq                     $24, %%mm2       \n\t"
    PAVGB(%%mm1, %%mm0)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
#else
    // plain MMX: unpack both pixels to words, add, then halve
    "movd          (%0, %%"REG_d"), %%mm0       \n\t"
    "movd         3(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm0       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "paddw                   %%mm2, %%mm0       \n\t"
    "movd         6(%0, %%"REG_d"), %%mm4       \n\t"
    "movd         9(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw               %%mm7, %%mm4       \n\t"
    "punpcklbw               %%mm7, %%mm2       \n\t"
    "paddw                   %%mm4, %%mm2       \n\t"
    "psrlw                      $1, %%mm0       \n\t"
    "psrlw                      $1, %%mm2       \n\t"
#endif
    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
    "movq  "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"

    // mm1/mm3 accumulate V (ff_bgr2VCoeff), mm0/mm2 accumulate U (mm6)
    "pmaddwd                 %%mm0, %%mm1       \n\t"
    "pmaddwd                 %%mm2, %%mm3       \n\t"
    "pmaddwd                 %%mm6, %%mm0       \n\t"
    "pmaddwd                 %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                      $8, %%mm0       \n\t"
    "psrad                      $8, %%mm1       \n\t"
    "psrad                      $8, %%mm2       \n\t"
    "psrad                      $8, %%mm3       \n\t"
#endif
    "packssdw                %%mm2, %%mm0       \n\t"
    "packssdw                %%mm3, %%mm1       \n\t"
    "pmaddwd                 %%mm5, %%mm0       \n\t"
    "pmaddwd                 %%mm5, %%mm1       \n\t"
    "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
    "psraw                      $7, %%mm0       \n\t"

    // second pair of chroma samples (source pixels 4..7)
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
    "movq       12(%0, %%"REG_d"), %%mm4       \n\t"
    "movq       18(%0, %%"REG_d"), %%mm2       \n\t"
    "movq                   %%mm4, %%mm1       \n\t"
    "movq                   %%mm2, %%mm3       \n\t"
    "psrlq                    $24, %%mm4       \n\t"
    "psrlq                    $24, %%mm2       \n\t"
    PAVGB(%%mm1, %%mm4)
    PAVGB(%%mm3, %%mm2)
    "punpcklbw              %%mm7, %%mm4       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
#else
    // mm5 is temporarily clobbered here and reloaded with ff_w1111 below
    "movd       12(%0, %%"REG_d"), %%mm4       \n\t"
    "movd       15(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw              %%mm7, %%mm4       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
    "paddw                  %%mm2, %%mm4       \n\t"
    "movd       18(%0, %%"REG_d"), %%mm5       \n\t"
    "movd       21(%0, %%"REG_d"), %%mm2       \n\t"
    "punpcklbw              %%mm7, %%mm5       \n\t"
    "punpcklbw              %%mm7, %%mm2       \n\t"
    "paddw                  %%mm5, %%mm2       \n\t"
    "movq      "MANGLE(ff_w1111)", %%mm5       \n\t"
    "psrlw                     $2, %%mm4       \n\t"
    "psrlw                     $2, %%mm2       \n\t"
#endif
    "movq "MANGLE(ff_bgr2VCoeff)", %%mm1       \n\t"
    "movq "MANGLE(ff_bgr2VCoeff)", %%mm3       \n\t"

    "pmaddwd                %%mm4, %%mm1       \n\t"
    "pmaddwd                %%mm2, %%mm3       \n\t"
    "pmaddwd                %%mm6, %%mm4       \n\t"
    "pmaddwd                %%mm6, %%mm2       \n\t"
#ifndef FAST_BGR2YV12
    "psrad                     $8, %%mm4       \n\t"
    "psrad                     $8, %%mm1       \n\t"
    "psrad                     $8, %%mm2       \n\t"
    "psrad                     $8, %%mm3       \n\t"
#endif
    "packssdw               %%mm2, %%mm4       \n\t"
    "packssdw               %%mm3, %%mm1       \n\t"
    "pmaddwd                %%mm5, %%mm4       \n\t"
    "pmaddwd                %%mm5, %%mm1       \n\t"
    "add                      $24, %%"REG_d"   \n\t" // advance source by 8 pixels
    "packssdw               %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
    "psraw                     $7, %%mm4       \n\t"

    // interleave the two halves into U0..U3 / V0..V3, bias, and store
    "movq                   %%mm0, %%mm1       \n\t"
    "punpckldq              %%mm4, %%mm0       \n\t"
    "punpckhdq              %%mm4, %%mm1       \n\t"
    "packsswb               %%mm1, %%mm0       \n\t"
    "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0    \n\t"

    "movd                   %%mm0, (%1, %%"REG_a")  \n\t" // 4 U bytes
    "punpckhdq              %%mm0, %%mm0            \n\t"
    "movd                   %%mm0, (%2, %%"REG_a")  \n\t" // 4 V bytes
    "add                       $4, %%"REG_a"        \n\t"
    " js                       1b                   \n\t"
    : : "r" (src1+width*6), "r" (dstU+width), "r" (dstV+width), "g" (-width)
    : "%"REG_a, "%"REG_d
    );
#else
    int i;
    for (i=0; i<width; i++)
    {
        int b= src1[6*i + 0] + src1[6*i + 3];
        int g= src1[6*i + 1] + src1[6*i + 4];
        int r= src1[6*i + 2] + src1[6*i + 5];

        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
    }
#endif /* HAVE_MMX */
    assert(src1 == src2);
}
2075

    
2076
static inline void RENAME(rgb16ToY)(uint8_t *dst, uint8_t *src, int width)
2077
{
2078
    int i;
2079
    for (i=0; i<width; i++)
2080
    {
2081
        int d= ((uint16_t*)src)[i];
2082
        int b= d&0x1F;
2083
        int g= (d>>5)&0x3F;
2084
        int r= (d>>11)&0x1F;
2085

    
2086
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2087
    }
2088
}
2089

    
2090
static inline void RENAME(rgb16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2091
{
2092
    int i;
2093
    assert(src1==src2);
2094
    for (i=0; i<width; i++)
2095
    {
2096
        int d0= ((uint32_t*)src1)[i];
2097

    
2098
        int dl= (d0&0x07E0F81F);
2099
        int dh= ((d0>>5)&0x07C0F83F);
2100

    
2101
        int dh2= (dh>>11) + (dh<<21);
2102
        int d= dh2 + dl;
2103

    
2104
        int b= d&0x7F;
2105
        int r= (d>>11)&0x7F;
2106
        int g= d>>21;
2107
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2108
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2109
    }
2110
}
2111

    
2112
static inline void RENAME(rgb15ToY)(uint8_t *dst, uint8_t *src, int width)
2113
{
2114
    int i;
2115
    for (i=0; i<width; i++)
2116
    {
2117
        int d= ((uint16_t*)src)[i];
2118
        int b= d&0x1F;
2119
        int g= (d>>5)&0x1F;
2120
        int r= (d>>10)&0x1F;
2121

    
2122
        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2123
    }
2124
}
2125

    
2126
static inline void RENAME(rgb15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2127
{
2128
    int i;
2129
    assert(src1==src2);
2130
    for (i=0; i<width; i++)
2131
    {
2132
        int d0= ((uint32_t*)src1)[i];
2133

    
2134
        int dl= (d0&0x03E07C1F);
2135
        int dh= ((d0>>5)&0x03E0F81F);
2136

    
2137
        int dh2= (dh>>11) + (dh<<21);
2138
        int d= dh2 + dl;
2139

    
2140
        int b= d&0x7F;
2141
        int r= (d>>10)&0x7F;
2142
        int g= d>>21;
2143
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2144
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2145
    }
2146
}
2147

    
2148

    
2149
static inline void RENAME(rgb32ToY)(uint8_t *dst, uint8_t *src, int width)
2150
{
2151
    int i;
2152
    for (i=0; i<width; i++)
2153
    {
2154
        int r=  ((uint32_t*)src)[i]&0xFF;
2155
        int g= (((uint32_t*)src)[i]>>8)&0xFF;
2156
        int b= (((uint32_t*)src)[i]>>16)&0xFF;
2157

    
2158
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2159
    }
2160
}
2161

    
2162
static inline void RENAME(rgb32ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2163
{
2164
    int i;
2165
    assert(src1==src2);
2166
    for (i=0; i<width; i++)
2167
    {
2168
        const int a= ((uint32_t*)src1)[2*i+0];
2169
        const int e= ((uint32_t*)src1)[2*i+1];
2170
        const int l= (a&0xFF00FF) + (e&0xFF00FF);
2171
        const int h= (a&0x00FF00) + (e&0x00FF00);
2172
        const int r=  l&0x3FF;
2173
        const int g=  h>>8;
2174
        const int b=  l>>16;
2175

    
2176
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2177
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2178
    }
2179
}
2180

    
2181
static inline void RENAME(rgb24ToY)(uint8_t *dst, uint8_t *src, int width)
2182
{
2183
    int i;
2184
    for (i=0; i<width; i++)
2185
    {
2186
        int r= src[i*3+0];
2187
        int g= src[i*3+1];
2188
        int b= src[i*3+2];
2189

    
2190
        dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
2191
    }
2192
}
2193

    
2194
static inline void RENAME(rgb24ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2195
{
2196
    int i;
2197
    assert(src1==src2);
2198
    for (i=0; i<width; i++)
2199
    {
2200
        int r= src1[6*i + 0] + src1[6*i + 3];
2201
        int g= src1[6*i + 1] + src1[6*i + 4];
2202
        int b= src1[6*i + 2] + src1[6*i + 5];
2203

    
2204
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1)) + 128;
2205
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1)) + 128;
2206
    }
2207
}
2208

    
2209
static inline void RENAME(bgr16ToY)(uint8_t *dst, uint8_t *src, int width)
2210
{
2211
    int i;
2212
    for (i=0; i<width; i++)
2213
    {
2214
        int d= ((uint16_t*)src)[i];
2215
        int r= d&0x1F;
2216
        int g= (d>>5)&0x3F;
2217
        int b= (d>>11)&0x1F;
2218

    
2219
        dst[i]= ((2*RY*r + GY*g + 2*BY*b)>>(RGB2YUV_SHIFT-2)) + 16;
2220
    }
2221
}
2222

    
2223
static inline void RENAME(bgr16ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2224
{
2225
    int i;
2226
    assert(src1 == src2);
2227
    for (i=0; i<width; i++)
2228
    {
2229
        int d0= ((uint32_t*)src1)[i];
2230

    
2231
        int dl= (d0&0x07E0F81F);
2232
        int d= dl + (((d0>>16) + (d0<<16))&0x07E0F81F);
2233

    
2234
        int r= d&0x3F;
2235
        int b= (d>>11)&0x3F;
2236
        int g= d>>21;
2237
        dstU[i]= ((2*RU*r + GU*g + 2*BU*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2238
        dstV[i]= ((2*RV*r + GV*g + 2*BV*b)>>(RGB2YUV_SHIFT+1-2)) + 128;
2239
    }
2240
}
2241

    
2242
static inline void RENAME(bgr15ToY)(uint8_t *dst, uint8_t *src, int width)
2243
{
2244
    int i;
2245
    for (i=0; i<width; i++)
2246
    {
2247
        int d= ((uint16_t*)src)[i];
2248
        int r= d&0x1F;
2249
        int g= (d>>5)&0x1F;
2250
        int b= (d>>10)&0x1F;
2251

    
2252
        dst[i]= ((RY*r + GY*g + BY*b)>>(RGB2YUV_SHIFT-3)) + 16;
2253
    }
2254
}
2255

    
2256
static inline void RENAME(bgr15ToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width)
2257
{
2258
    int i;
2259
    assert(src1 == src2);
2260
    for (i=0; i<width; i++)
2261
    {
2262
        int d0= ((uint32_t*)src1)[i];
2263

    
2264
        int dl= (d0&0x03E07C1F);
2265
        int d= dl + (((d0>>16) + (d0<<16))&0x03E07C1F);
2266

    
2267
        int r= d&0x3F;
2268
        int b= (d>>10)&0x3F;
2269
        int g= d>>21;
2270
        dstU[i]= ((RU*r + GU*g + BU*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2271
        dstV[i]= ((RV*r + GV*g + BV*b)>>(RGB2YUV_SHIFT+1-3)) + 128;
2272
    }
2273
}
2274

    
2275
static inline void RENAME(palToY)(uint8_t *dst, uint8_t *src, int width, uint32_t *pal)
2276
{
2277
    int i;
2278
    for (i=0; i<width; i++)
2279
    {
2280
        int d= src[i];
2281

    
2282
        dst[i]= pal[d] & 0xFF;
2283
    }
2284
}
2285

    
2286
static inline void RENAME(palToUV)(uint8_t *dstU, uint8_t *dstV, uint8_t *src1, uint8_t *src2, int width, uint32_t *pal)
2287
{
2288
    int i;
2289
    assert(src1 == src2);
2290
    for (i=0; i<width; i++)
2291
    {
2292
        int p= pal[src1[i]];
2293

    
2294
        dstU[i]= p>>8;
2295
        dstV[i]= p>>16;
2296
    }
2297
}
2298

    
2299
// Bilinear / Bicubic scaling
2300
/* Horizontal scaling with an arbitrary FIR filter.
 *
 * dst        : output line, one int16 per destination pixel
 * dstW       : number of destination pixels
 * src        : 8-bit input line
 * filter     : filterSize coefficients per destination pixel
 * filterPos  : per-pixel start offset into src
 * filterSize : taps per output pixel (asserted to be a positive multiple
 *              of 4 on the MMX path)
 *
 * The MMX paths special-case filterSize 4 and 8 and fall back to a generic
 * inner loop otherwise; non-MMX builds use AltiVec or plain C.
 * NOTE(review): srcW and xInc are unused here — they appear to exist only
 * to keep the signature uniform with the other scalers; confirm.
 */
static inline void RENAME(hScale)(int16_t *dst, int dstW, uint8_t *src, int srcW, int xInc,
                                  int16_t *filter, int16_t *filterPos, long filterSize)
{
#ifdef HAVE_MMX
    assert(filterSize % 4 == 0 && filterSize>0);
    if (filterSize==4) // Always true for upscaling, sometimes for down, too.
    {
        /* Rebase the pointers so a single negative byte counter can index
           all three arrays and the loop ends when it wraps past zero
           (the "jnc" below tests the carry of the add). */
        long counter= -2*dstW;
        filter-= counter*2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push            %%"REG_b"              \n\t"
#endif
        "pxor                %%mm7, %%mm7       \n\t"
        "movq        "MANGLE(w02)", %%mm6       \n\t"
        "push           %%"REG_BP"              \n\t" // we use 7 regs here ...
        "mov             %%"REG_a", %%"REG_BP"  \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        /* two destination pixels per iteration: load their src offsets,
           their 4 filter taps each, multiply-accumulate, scale, store */
        "movzwl   (%2, %%"REG_BP"), %%eax       \n\t"
        "movzwl  2(%2, %%"REG_BP"), %%ebx       \n\t"
        "movq  (%1, %%"REG_BP", 4), %%mm1       \n\t"
        "movq 8(%1, %%"REG_BP", 4), %%mm3       \n\t"
        "movd      (%3, %%"REG_a"), %%mm0       \n\t"
        "movd      (%3, %%"REG_b"), %%mm2       \n\t"
        "punpcklbw           %%mm7, %%mm0       \n\t"
        "punpcklbw           %%mm7, %%mm2       \n\t"
        "pmaddwd             %%mm1, %%mm0       \n\t"
        "pmaddwd             %%mm2, %%mm3       \n\t"
        "psrad                  $8, %%mm0       \n\t"
        "psrad                  $8, %%mm3       \n\t"
        "packssdw            %%mm3, %%mm0       \n\t"
        "pmaddwd             %%mm6, %%mm0       \n\t"
        "packssdw            %%mm0, %%mm0       \n\t"
        "movd                %%mm0, (%4, %%"REG_BP")    \n\t"
        "add                    $4, %%"REG_BP"  \n\t"
        " jnc                   1b              \n\t"

        "pop            %%"REG_BP"              \n\t"
#if defined(PIC)
        "pop             %%"REG_b"              \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else if (filterSize==8)
    {
        /* Same structure as the 4-tap path, with a second
           multiply-accumulate round for taps 4..7. */
        long counter= -2*dstW;
        filter-= counter*4;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
#if defined(PIC)
        "push             %%"REG_b"             \n\t"
#endif
        "pxor                 %%mm7, %%mm7      \n\t"
        "movq         "MANGLE(w02)", %%mm6      \n\t"
        "push            %%"REG_BP"             \n\t" // we use 7 regs here ...
        "mov              %%"REG_a", %%"REG_BP" \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "movzwl    (%2, %%"REG_BP"), %%eax      \n\t"
        "movzwl   2(%2, %%"REG_BP"), %%ebx      \n\t"
        "movq   (%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 16(%1, %%"REG_BP", 8), %%mm3      \n\t"
        "movd       (%3, %%"REG_a"), %%mm0      \n\t"
        "movd       (%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm0      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm0      \n\t"
        "pmaddwd              %%mm2, %%mm3      \n\t"

        /* taps 4..7 */
        "movq  8(%1, %%"REG_BP", 8), %%mm1      \n\t"
        "movq 24(%1, %%"REG_BP", 8), %%mm5      \n\t"
        "movd      4(%3, %%"REG_a"), %%mm4      \n\t"
        "movd      4(%3, %%"REG_b"), %%mm2      \n\t"
        "punpcklbw            %%mm7, %%mm4      \n\t"
        "punpcklbw            %%mm7, %%mm2      \n\t"
        "pmaddwd              %%mm1, %%mm4      \n\t"
        "pmaddwd              %%mm2, %%mm5      \n\t"
        "paddd                %%mm4, %%mm0      \n\t"
        "paddd                %%mm5, %%mm3      \n\t"

        "psrad                   $8, %%mm0      \n\t"
        "psrad                   $8, %%mm3      \n\t"
        "packssdw             %%mm3, %%mm0      \n\t"
        "pmaddwd              %%mm6, %%mm0      \n\t"
        "packssdw             %%mm0, %%mm0      \n\t"
        "movd                 %%mm0, (%4, %%"REG_BP")   \n\t"
        "add                     $4, %%"REG_BP" \n\t"
        " jnc                    1b             \n\t"

        "pop             %%"REG_BP"             \n\t"
#if defined(PIC)
        "pop              %%"REG_b"             \n\t"
#endif
        : "+a" (counter)
        : "c" (filter), "d" (filterPos), "S" (src), "D" (dst)
#if !defined(PIC)
        : "%"REG_b
#endif
        );
    }
    else
    {
        /* Generic MMX path: inner loop "2:" walks filterSize taps in
           chunks of 4, comparing the advancing src pointer against
           "offset" (src+filterSize) to detect the end of the taps. */
        uint8_t *offset = src+filterSize;
        long counter= -2*dstW;
        //filter-= counter*filterSize/2;
        filterPos-= counter/2;
        dst-= counter/2;
        asm volatile(
        "pxor                  %%mm7, %%mm7     \n\t"
        "movq          "MANGLE(w02)", %%mm6     \n\t"
        ASMALIGN(4)
        "1:                                     \n\t"
        "mov                      %2, %%"REG_c" \n\t"
        "movzwl      (%%"REG_c", %0), %%eax     \n\t"
        "movzwl     2(%%"REG_c", %0), %%edx     \n\t"
        "mov                      %5, %%"REG_c" \n\t"
        "pxor                  %%mm4, %%mm4     \n\t"
        "pxor                  %%mm5, %%mm5     \n\t"
        "2:                                     \n\t"
        "movq                   (%1), %%mm1     \n\t"
        "movq               (%1, %6), %%mm3     \n\t"
        "movd (%%"REG_c", %%"REG_a"), %%mm0     \n\t"
        "movd (%%"REG_c", %%"REG_d"), %%mm2     \n\t"
        "punpcklbw             %%mm7, %%mm0     \n\t"
        "punpcklbw             %%mm7, %%mm2     \n\t"
        "pmaddwd               %%mm1, %%mm0     \n\t"
        "pmaddwd               %%mm2, %%mm3     \n\t"
        "paddd                 %%mm3, %%mm5     \n\t"
        "paddd                 %%mm0, %%mm4     \n\t"
        "add                      $8, %1        \n\t"
        "add                      $4, %%"REG_c" \n\t"
        "cmp                      %4, %%"REG_c" \n\t"
        " jb                      2b            \n\t"
        "add                      %6, %1        \n\t"
        "psrad                    $8, %%mm4     \n\t"
        "psrad                    $8, %%mm5     \n\t"
        "packssdw              %%mm5, %%mm4     \n\t"
        "pmaddwd               %%mm6, %%mm4     \n\t"
        "packssdw              %%mm4, %%mm4     \n\t"
        "mov                      %3, %%"REG_a" \n\t"
        "movd                  %%mm4, (%%"REG_a", %0)   \n\t"
        "add                      $4, %0        \n\t"
        " jnc                     1b            \n\t"

        : "+r" (counter), "+r" (filter)
        : "m" (filterPos), "m" (dst), "m"(offset),
          "m" (src), "r" (filterSize*2)
        : "%"REG_a, "%"REG_c, "%"REG_d
        );
    }
#else
#ifdef HAVE_ALTIVEC
    hScale_altivec_real(dst, dstW, src, srcW, xInc, filter, filterPos, filterSize);
#else
    /* Plain C reference: per output pixel, dot product of filterSize taps
       with the src window starting at filterPos[i], then scale and clip. */
    int i;
    for (i=0; i<dstW; i++)
    {
        int j;
        int srcPos= filterPos[i];
        int val=0;
        //printf("filterPos: %d\n", filterPos[i]);
        for (j=0; j<filterSize; j++)
        {
            //printf("filter: %d, src: %d\n", filter[i], src[srcPos + j]);
            val += ((int)src[srcPos + j])*filter[filterSize*i + j];
        }
        //filter += hFilterSize;
        dst[i] = av_clip(val>>7, 0, (1<<15)-1); // the cubic equation does overflow ...
        //dst[i] = val>>7;
    }
#endif /* HAVE_ALTIVEC */
#endif /* HAVE_MMX */
}
2482
      // *** horizontal scale Y line to temp buffer
2483
/* Horizontally scale one luma line into the 16-bit temp buffer dst.
 *
 * First, packed/palettized source formats are converted to a plain 8-bit
 * luma line in formatConvBuffer.  Then one of three scalers runs:
 *   - the FIR scaler hScale() when SWS_FAST_BILINEAR is off (or, on MMX
 *     builds, whenever the MMX2 "funny code" cannot be used),
 *   - the runtime-generated MMX2 code block funnyYCode (fast bilinear),
 *   - a plain x86-asm or C bilinear loop otherwise.
 */
static inline void RENAME(hyscale)(uint16_t *dst, long dstWidth, uint8_t *src, int srcW, int xInc,
                                   int flags, int canMMX2BeUsed, int16_t *hLumFilter,
                                   int16_t *hLumFilterPos, int hLumFilterSize, void *funnyYCode,
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
                                   int32_t *mmx2FilterPos, uint8_t *pal)
{
    /* Input-format normalization: each branch rewrites the line into
       formatConvBuffer and repoints src at it. */
    if (srcFormat==PIX_FMT_YUYV422 || srcFormat==PIX_FMT_GRAY16BE)
    {
        RENAME(yuy2ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_UYVY422 || srcFormat==PIX_FMT_GRAY16LE)
    {
        RENAME(uyvyToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB32)
    {
        RENAME(bgr32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR24)
    {
        RENAME(bgr24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR565)
    {
        RENAME(bgr16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR555)
    {
        RENAME(bgr15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_BGR32)
    {
        RENAME(rgb32ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB24)
    {
        RENAME(rgb24ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB565)
    {
        RENAME(rgb16ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB555)
    {
        RENAME(rgb15ToY)(formatConvBuffer, src, srcW);
        src= formatConvBuffer;
    }
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
    {
        RENAME(palToY)(formatConvBuffer, src, srcW, pal);
        src= formatConvBuffer;
    }

#ifdef HAVE_MMX
    // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
#else
    if (!(flags&SWS_FAST_BILINEAR))
#endif
    {
        RENAME(hScale)(dst, dstWidth, src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize);
    }
    else // Fast Bilinear upscale / crap downscale
    {
#if defined(ARCH_X86)
#ifdef HAVE_MMX2
        int i;
#if defined(PIC)
        uint64_t ebxsave __attribute__((aligned(8)));
#endif
        if (canMMX2BeUsed)
        {
            /* Dispatch into the runtime-generated scaler (funnyYCode) in
               8 segments; mmx2FilterPos carries the per-segment src
               advance consumed by FUNNY_Y_CODE between calls. */
            asm volatile(
#if defined(PIC)
            "mov               %%"REG_b", %5        \n\t" /* save ebx: PIC register */
#endif
            "pxor                  %%mm7, %%mm7     \n\t"
            "mov                      %0, %%"REG_c" \n\t"
            "mov                      %1, %%"REG_D" \n\t"
            "mov                      %2, %%"REG_d" \n\t"
            "mov                      %3, %%"REG_b" \n\t"
            "xor               %%"REG_a", %%"REG_a" \n\t" // i
            PREFETCH"        (%%"REG_c")            \n\t"
            PREFETCH"      32(%%"REG_c")            \n\t"
            PREFETCH"      64(%%"REG_c")            \n\t"

#ifdef ARCH_X86_64

#define FUNNY_Y_CODE \
            "movl            (%%"REG_b"), %%esi     \n\t"\
            "call                    *%4            \n\t"\
            "movl (%%"REG_b", %%"REG_a"), %%esi     \n\t"\
            "add               %%"REG_S", %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#else

#define FUNNY_Y_CODE \
            "movl (%%"REG_b"), %%esi        \n\t"\
            "call         *%4                       \n\t"\
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
            "add               %%"REG_a", %%"REG_D" \n\t"\
            "xor               %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */

FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE
FUNNY_Y_CODE

#if defined(PIC)
            "mov                      %5, %%"REG_b" \n\t" /* restore ebx */
#endif
            :: "m" (src), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
            "m" (funnyYCode)
#if defined(PIC)
            ,"m" (ebxsave)
#endif
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
            ,"%"REG_b
#endif
            );
            /* Edge fixup: pixels whose sample point falls past srcW-1 get
               the last src value (<<7 to match the 9.7 fixed-point dst). */
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) dst[i] = src[srcW-1]*128;
        }
        else
        {
#endif /* HAVE_MMX2 */
        /* Plain x86 bilinear loop: REG_d tracks the integer src position,
           ecx the 16-bit fractional phase; two output pixels per pass. */
        long xInc_shr16 = xInc >> 16;
        uint16_t xInc_mask = xInc & 0xffff;
        //NO MMX just normal asm ...
        asm volatile(
        "xor %%"REG_a", %%"REG_a"            \n\t" // i
        "xor %%"REG_d", %%"REG_d"            \n\t" // xx
        "xorl    %%ecx, %%ecx                \n\t" // 2*xalpha
        ASMALIGN(4)
        "1:                                  \n\t"
        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "movzbl    (%0, %%"REG_d"), %%edi    \n\t" //src[xx]
        "movzbl   1(%0, %%"REG_d"), %%esi    \n\t" //src[xx+1]
        "subl    %%edi, %%esi                \n\t" //src[xx+1] - src[xx]
        "imull   %%ecx, %%esi                \n\t" //(src[xx+1] - src[xx])*2*xalpha
        "shll      $16, %%edi                \n\t"
        "addl    %%edi, %%esi                \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
        "mov        %1, %%"REG_D"            \n\t"
        "shrl       $9, %%esi                \n\t"
        "movw     %%si, 2(%%"REG_D", %%"REG_a", 2)  \n\t"
        "addw       %4, %%cx                 \n\t" //2*xalpha += xInc&0xFF
        "adc        %3, %%"REG_d"            \n\t" //xx+= xInc>>8 + carry

        "add        $2, %%"REG_a"            \n\t"
        "cmp        %2, %%"REG_a"            \n\t"
        " jb        1b                       \n\t"

        :: "r" (src), "m" (dst), "m" (dstWidth), "m" (xInc_shr16), "m" (xInc_mask)
        : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
        );
#ifdef HAVE_MMX2
        } //if MMX2 can't be used
#endif
#else
        /* C fallback: 16.16 fixed-point position, output in 9.7. */
        int i;
        unsigned int xpos=0;
        for (i=0;i<dstWidth;i++)
        {
            register unsigned int xx=xpos>>16;
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
            dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
            xpos+=xInc;
        }
#endif /* defined(ARCH_X86) */
    }
}
2684

    
2685
inline static void RENAME(hcscale)(uint16_t *dst, long dstWidth, uint8_t *src1, uint8_t *src2,
2686
                                   int srcW, int xInc, int flags, int canMMX2BeUsed, int16_t *hChrFilter,
2687
                                   int16_t *hChrFilterPos, int hChrFilterSize, void *funnyUVCode,
2688
                                   int srcFormat, uint8_t *formatConvBuffer, int16_t *mmx2Filter,
2689
                                   int32_t *mmx2FilterPos, uint8_t *pal)
2690
{
2691
    if (srcFormat==PIX_FMT_YUYV422)
2692
    {
2693
        RENAME(yuy2ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2694
        src1= formatConvBuffer;
2695
        src2= formatConvBuffer+2048;
2696
    }
2697
    else if (srcFormat==PIX_FMT_UYVY422)
2698
    {
2699
        RENAME(uyvyToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2700
        src1= formatConvBuffer;
2701
        src2= formatConvBuffer+2048;
2702
    }
2703
    else if (srcFormat==PIX_FMT_RGB32)
2704
    {
2705
        RENAME(bgr32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2706
        src1= formatConvBuffer;
2707
        src2= formatConvBuffer+2048;
2708
    }
2709
    else if (srcFormat==PIX_FMT_BGR24)
2710
    {
2711
        RENAME(bgr24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2712
        src1= formatConvBuffer;
2713
        src2= formatConvBuffer+2048;
2714
    }
2715
    else if (srcFormat==PIX_FMT_BGR565)
2716
    {
2717
        RENAME(bgr16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2718
        src1= formatConvBuffer;
2719
        src2= formatConvBuffer+2048;
2720
    }
2721
    else if (srcFormat==PIX_FMT_BGR555)
2722
    {
2723
        RENAME(bgr15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2724
        src1= formatConvBuffer;
2725
        src2= formatConvBuffer+2048;
2726
    }
2727
    else if (srcFormat==PIX_FMT_BGR32)
2728
    {
2729
        RENAME(rgb32ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2730
        src1= formatConvBuffer;
2731
        src2= formatConvBuffer+2048;
2732
    }
2733
    else if (srcFormat==PIX_FMT_RGB24)
2734
    {
2735
        RENAME(rgb24ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2736
        src1= formatConvBuffer;
2737
        src2= formatConvBuffer+2048;
2738
    }
2739
    else if (srcFormat==PIX_FMT_RGB565)
2740
    {
2741
        RENAME(rgb16ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2742
        src1= formatConvBuffer;
2743
        src2= formatConvBuffer+2048;
2744
    }
2745
    else if (srcFormat==PIX_FMT_RGB555)
2746
    {
2747
        RENAME(rgb15ToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW);
2748
        src1= formatConvBuffer;
2749
        src2= formatConvBuffer+2048;
2750
    }
2751
    else if (isGray(srcFormat))
2752
    {
2753
        return;
2754
    }
2755
    else if (srcFormat==PIX_FMT_RGB8 || srcFormat==PIX_FMT_BGR8 || srcFormat==PIX_FMT_PAL8 || srcFormat==PIX_FMT_BGR4_BYTE  || srcFormat==PIX_FMT_RGB4_BYTE)
2756
    {
2757
        RENAME(palToUV)(formatConvBuffer, formatConvBuffer+2048, src1, src2, srcW, pal);
2758
        src1= formatConvBuffer;
2759
        src2= formatConvBuffer+2048;
2760
    }
2761

    
2762
#ifdef HAVE_MMX
2763
    // use the new MMX scaler if the mmx2 can't be used (it is faster than the x86 ASM one)
2764
    if (!(flags&SWS_FAST_BILINEAR) || (!canMMX2BeUsed))
2765
#else
2766
    if (!(flags&SWS_FAST_BILINEAR))
2767
#endif
2768
    {
2769
        RENAME(hScale)(dst     , dstWidth, src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2770
        RENAME(hScale)(dst+2048, dstWidth, src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize);
2771
    }
2772
    else // Fast Bilinear upscale / crap downscale
2773
    {
2774
#if defined(ARCH_X86)
2775
#ifdef HAVE_MMX2
2776
        int i;
2777
#if defined(PIC)
2778
        uint64_t ebxsave __attribute__((aligned(8)));
2779
#endif
2780
        if (canMMX2BeUsed)
2781
        {
2782
            asm volatile(
2783
#if defined(PIC)
2784
            "mov          %%"REG_b", %6         \n\t"
2785
#endif
2786
            "pxor             %%mm7, %%mm7      \n\t"
2787
            "mov                 %0, %%"REG_c"  \n\t"
2788
            "mov                 %1, %%"REG_D"  \n\t"
2789
            "mov                 %2, %%"REG_d"  \n\t"
2790
            "mov                 %3, %%"REG_b"  \n\t"
2791
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2792
            PREFETCH"   (%%"REG_c")             \n\t"
2793
            PREFETCH" 32(%%"REG_c")             \n\t"
2794
            PREFETCH" 64(%%"REG_c")             \n\t"
2795

    
2796
#ifdef ARCH_X86_64
2797

    
2798
#define FUNNY_UV_CODE \
2799
            "movl       (%%"REG_b"), %%esi      \n\t"\
2800
            "call               *%4             \n\t"\
2801
            "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
2802
            "add          %%"REG_S", %%"REG_c"  \n\t"\
2803
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2804
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2805

    
2806
#else
2807

    
2808
#define FUNNY_UV_CODE \
2809
            "movl       (%%"REG_b"), %%esi      \n\t"\
2810
            "call               *%4             \n\t"\
2811
            "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
2812
            "add          %%"REG_a", %%"REG_D"  \n\t"\
2813
            "xor          %%"REG_a", %%"REG_a"  \n\t"\
2814

    
2815
#endif /* ARCH_X86_64 */
2816

    
2817
FUNNY_UV_CODE
2818
FUNNY_UV_CODE
2819
FUNNY_UV_CODE
2820
FUNNY_UV_CODE
2821
            "xor          %%"REG_a", %%"REG_a"  \n\t" // i
2822
            "mov                 %5, %%"REG_c"  \n\t" // src
2823
            "mov                 %1, %%"REG_D"  \n\t" // buf1
2824
            "add              $4096, %%"REG_D"  \n\t"
2825
            PREFETCH"   (%%"REG_c")             \n\t"
2826
            PREFETCH" 32(%%"REG_c")             \n\t"
2827
            PREFETCH" 64(%%"REG_c")             \n\t"
2828

    
2829
FUNNY_UV_CODE
2830
FUNNY_UV_CODE
2831
FUNNY_UV_CODE
2832
FUNNY_UV_CODE
2833

    
2834
#if defined(PIC)
2835
            "mov %6, %%"REG_b"    \n\t"
2836
#endif
2837
            :: "m" (src1), "m" (dst), "m" (mmx2Filter), "m" (mmx2FilterPos),
2838
            "m" (funnyUVCode), "m" (src2)
2839
#if defined(PIC)
2840
            ,"m" (ebxsave)
2841
#endif
2842
            : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
2843
#if !defined(PIC)
2844
             ,"%"REG_b
2845
#endif
2846
            );
2847
            for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2848
            {
2849
                //printf("%d %d %d\n", dstWidth, i, srcW);
2850
                dst[i] = src1[srcW-1]*128;
2851
                dst[i+2048] = src2[srcW-1]*128;
2852
            }
2853
        }
2854
        else
2855
        {
2856
#endif /* HAVE_MMX2 */
2857
            long xInc_shr16 = (long) (xInc >> 16);
2858
            uint16_t xInc_mask = xInc & 0xffff;
2859
            asm volatile(
2860
            "xor %%"REG_a", %%"REG_a"               \n\t" // i
2861
            "xor %%"REG_d", %%"REG_d"               \n\t" // xx
2862
            "xorl    %%ecx, %%ecx                   \n\t" // 2*xalpha
2863
            ASMALIGN(4)
2864
            "1:                                     \n\t"
2865
            "mov        %0, %%"REG_S"               \n\t"
2866
            "movzbl  (%%"REG_S", %%"REG_d"), %%edi  \n\t" //src[xx]
2867
            "movzbl 1(%%"REG_S", %%"REG_d"), %%esi  \n\t" //src[xx+1]
2868
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2869
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2870
            "shll      $16, %%edi                   \n\t"
2871
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2872
            "mov        %1, %%"REG_D"               \n\t"
2873
            "shrl       $9, %%esi                   \n\t"
2874
            "movw     %%si, (%%"REG_D", %%"REG_a", 2)   \n\t"
2875

    
2876
            "movzbl    (%5, %%"REG_d"), %%edi       \n\t" //src[xx]
2877
            "movzbl   1(%5, %%"REG_d"), %%esi       \n\t" //src[xx+1]
2878
            "subl    %%edi, %%esi                   \n\t" //src[xx+1] - src[xx]
2879
            "imull   %%ecx, %%esi                   \n\t" //(src[xx+1] - src[xx])*2*xalpha
2880
            "shll      $16, %%edi                   \n\t"
2881
            "addl    %%edi, %%esi                   \n\t" //src[xx+1]*2*xalpha + src[xx]*(1-2*xalpha)
2882
            "mov        %1, %%"REG_D"               \n\t"
2883
            "shrl       $9, %%esi                   \n\t"
2884
            "movw     %%si, 4096(%%"REG_D", %%"REG_a", 2)   \n\t"
2885

    
2886
            "addw       %4, %%cx                    \n\t" //2*xalpha += xInc&0xFF
2887
            "adc        %3, %%"REG_d"               \n\t" //xx+= xInc>>8 + carry
2888
            "add        $1, %%"REG_a"               \n\t"
2889
            "cmp        %2, %%"REG_a"               \n\t"
2890
            " jb        1b                          \n\t"
2891

    
2892
/* GCC-3.3 makes MPlayer crash on IA-32 machines when using "g" operand here,
2893
   which is needed to support GCC-4.0 */
2894
#if defined(ARCH_X86_64) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
2895
            :: "m" (src1), "m" (dst), "g" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2896
#else
2897
            :: "m" (src1), "m" (dst), "m" ((long)dstWidth), "m" (xInc_shr16), "m" (xInc_mask),
2898
#endif
2899
            "r" (src2)
2900
            : "%"REG_a, "%"REG_d, "%ecx", "%"REG_D, "%esi"
2901
            );
2902
#ifdef HAVE_MMX2
2903
        } //if MMX2 can't be used
2904
#endif
2905
#else
2906
        int i;
2907
        unsigned int xpos=0;
2908
        for (i=0;i<dstWidth;i++)
2909
        {
2910
            register unsigned int xx=xpos>>16;
2911
            register unsigned int xalpha=(xpos&0xFFFF)>>9;
2912
            dst[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2913
            dst[i+2048]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2914
            /* slower
2915
            dst[i]= (src1[xx]<<7) + (src1[xx+1] - src1[xx])*xalpha;
2916
            dst[i+2048]=(src2[xx]<<7) + (src2[xx+1] - src2[xx])*xalpha;
2917
            */
2918
            xpos+=xInc;
2919
        }
2920
#endif /* defined(ARCH_X86) */
2921
    }
2922
}
/**
 * Main scaling entry point for this template instantiation (C/MMX/MMX2,
 * selected via RENAME()). Consumes one horizontal slice of the source
 * picture and outputs as many destination lines as the slice permits.
 *
 * @param c          scaler context (filters, ring buffers, geometry)
 * @param src        source plane pointers (repacked below for packed input)
 * @param srcStride  per-plane source strides in bytes
 * @param srcSliceY  first source line of this slice
 * @param srcSliceH  number of source lines in this slice
 * @param dst        destination plane pointers
 * @param dstStride  per-plane destination strides in bytes
 * @return number of destination lines written (dstY - lastDstY)
 */
static int RENAME(swScale)(SwsContext *c, uint8_t* src[], int srcStride[], int srcSliceY,
                           int srcSliceH, uint8_t* dst[], int dstStride[]){

    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const int dstFormat= c->dstFormat;
    const int srcFormat= c->srcFormat;
    const int flags= c->flags;
    const int canMMX2BeUsed= c->canMMX2BeUsed;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrPixBuf= c->chrPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *funnyYCode= c->funnyYCode;
    uint8_t *funnyUVCode= c->funnyUVCode;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    /* round the chroma slice height up, not down (negate-shift-negate = ceil) */
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint8_t *pal=NULL;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    if (isPacked(c->srcFormat)){
        /* for packed input the palette rides in src[1]; all three "planes"
           alias the single packed plane with the same stride */
        pal= src[1];
        src[0]=
        src[1]=
        src[2]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]= srcStride[0];
    }
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    //printf("swscale %X %X %X -> %X %X %X\n", (int)src[0], (int)src[1], (int)src[2],
    //       (int)dst[0], (int)dst[1], (int)dst[2]);

#if 0 //self test FIXME move to a vfilter or something
    {
    static volatile int i=0;
    i++;
    if (srcFormat==PIX_FMT_YUV420P && i==1 && srcSliceH>= c->srcH)
        selfTest(src, srcStride, c->srcW, c->srcH);
    i--;
    }
#endif

    //printf("sws Strides:%d %d %d -> %d %d %d\n", srcStride[0],srcStride[1],srcStride[2],
    //dstStride[0],dstStride[1],dstStride[2]);

    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0)
    {
        static int firstTime=1; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && firstTime)
        {
            av_log(c, AV_LOG_WARNING, "SwScaler: Warning: dstStride is not aligned!\n"
                   "SwScaler:          ->cannot do aligned memory acesses anymore\n");
            firstTime=0;
        }
    }

    /* Note the user might start scaling the picture in the middle so this will not get executed
       this is not really intended but works currently, so ppl might do it */
    if (srcSliceY ==0){
        lumBufIndex=0;
        chrBufIndex=0;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    lastDstY= dstY;

    for (;dstY < dstH; dstY++){
        unsigned char *dest =dst[0]+dstStride[0]*dstY;
        const int chrDstY= dstY>>c->chrDstVSubSample;
        unsigned char *uDest=dst[1]+dstStride[1]*chrDstY;
        unsigned char *vDest=dst[2]+dstStride[2]*chrDstY;

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        const int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        const int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input

        //handle holes (FAST_BILINEAR & weird filters)
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        ASSERT(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1)
        ASSERT(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1)

        // Do we have enough lines in this slice to output the dstY line
        if (lastLumSrcY < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample))
        {
            //Do horizontal scaling
            while(lastInLumBuf < lastLumSrcY)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                ASSERT(lumBufIndex < 2*vLumBufSize)
                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf < lastChrSrcY)
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                ASSERT(chrBufIndex < 2*vChrBufSize)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH))
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)
                //FIXME replace parameters through context struct (some at least)

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                                    flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                                    funnyUVCode, c->srcFormat, formatConvBuffer,
                                    c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        }
        else // not enough lines left in this slice -> load the rest in the buffer
        {
            //Do horizontal scaling
            while(lastInLumBuf+1 < srcSliceY + srcSliceH)
            {
                uint8_t *s= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
                lumBufIndex++;
                ASSERT(lumBufIndex < 2*vLumBufSize)
                ASSERT(lastInLumBuf + 1 - srcSliceY < srcSliceH)
                ASSERT(lastInLumBuf + 1 - srcSliceY >= 0)
                RENAME(hyscale)(lumPixBuf[ lumBufIndex ], dstW, s, srcW, lumXInc,
                                flags, canMMX2BeUsed, hLumFilter, hLumFilterPos, hLumFilterSize,
                                funnyYCode, c->srcFormat, formatConvBuffer,
                                c->lumMmx2Filter, c->lumMmx2FilterPos, pal);
                lastInLumBuf++;
            }
            while(lastInChrBuf+1 < (chrSrcSliceY + chrSrcSliceH))
            {
                uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
                uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
                chrBufIndex++;
                ASSERT(chrBufIndex < 2*vChrBufSize)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY < chrSrcSliceH)
                ASSERT(lastInChrBuf + 1 - chrSrcSliceY >= 0)

                if (!(isGray(srcFormat) || isGray(dstFormat)))
                    RENAME(hcscale)(chrPixBuf[ chrBufIndex ], chrDstW, src1, src2, chrSrcW, chrXInc,
                            flags, canMMX2BeUsed, hChrFilter, hChrFilterPos, hChrFilterSize,
                            funnyUVCode, c->srcFormat, formatConvBuffer,
                            c->chrMmx2Filter, c->chrMmx2FilterPos, pal);
                lastInChrBuf++;
            }
            //wrap buf index around to stay inside the ring buffer
            if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
            if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
            break; //we can't output a dstY line so let's try with the next slice
        }

#ifdef HAVE_MMX
        b5Dither= dither8[dstY&1];
        g6Dither= dither4[dstY&1];
        g5Dither= dither8[dstY&1];
        r5Dither= dither8[(dstY+1)&1];
#endif
        if (dstY < dstH-2)
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
#ifdef HAVE_MMX
            int i;
        if (flags & SWS_ACCURATE_RND){
            /* pack two filter taps (and two line pointers) per entry for the
               accurate-rounding MMX vertical scaler */
            for (i=0; i<vLumFilterSize; i+=2){
                lumMmxFilter[2*i+0]= (int32_t)lumSrcPtr[i  ];
                lumMmxFilter[2*i+1]= (int32_t)lumSrcPtr[i+(vLumFilterSize>1)];
                lumMmxFilter[2*i+2]=
                lumMmxFilter[2*i+3]= vLumFilter[dstY*vLumFilterSize + i    ]
                    + (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
            }
            for (i=0; i<vChrFilterSize; i+=2){
                chrMmxFilter[2*i+0]= (int32_t)chrSrcPtr[i  ];
                chrMmxFilter[2*i+1]= (int32_t)chrSrcPtr[i+(vChrFilterSize>1)];
                chrMmxFilter[2*i+2]=
                chrMmxFilter[2*i+3]= vChrFilter[chrDstY*vChrFilterSize + i    ]
                    + (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
            }
        }else{
            /* one line pointer (split into two 32-bit halves for 64-bit
               pointers) and one duplicated coefficient per entry */
            for (i=0; i<vLumFilterSize; i++)
            {
                lumMmxFilter[4*i+0]= (int32_t)lumSrcPtr[i];
                lumMmxFilter[4*i+1]= (uint64_t)lumSrcPtr[i] >> 32;
                lumMmxFilter[4*i+2]=
                lumMmxFilter[4*i+3]=
                    ((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001;
            }
            for (i=0; i<vChrFilterSize; i++)
            {
                chrMmxFilter[4*i+0]= (int32_t)chrSrcPtr[i];
                chrMmxFilter[4*i+1]= (uint64_t)chrSrcPtr[i] >> 32;
                chrMmxFilter[4*i+2]=
                chrMmxFilter[4*i+3]=
                    ((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001;
            }
        }
#endif
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                RENAME(yuv2nv12X)(c,
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12 like
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                if (vLumFilterSize == 1 && vChrFilterSize == 1) // Unscaled YV12
                {
                    int16_t *lumBuf = lumPixBuf[0];
                    int16_t *chrBuf= chrPixBuf[0];
                    RENAME(yuv2yuv1)(lumBuf, chrBuf, dest, uDest, vDest, dstW, chrDstW);
                }
                else //General YV12
                {
                    RENAME(yuv2yuvX)(c,
                        vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                        vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, uDest, vDest, dstW, chrDstW);
                }
            }
            else
            {
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                if (vLumFilterSize == 1 && vChrFilterSize == 2) //Unscaled RGB
                {
                    int chrAlpha= vChrFilter[2*dstY+1];
                    RENAME(yuv2packed1)(c, *lumSrcPtr, *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, chrAlpha, dstFormat, flags, dstY);
                }
                else if (vLumFilterSize == 2 && vChrFilterSize == 2) //BiLinear Upscale RGB
                {
                    int lumAlpha= vLumFilter[2*dstY+1];
                    int chrAlpha= vChrFilter[2*dstY+1];
                    lumMmxFilter[2]=
                    lumMmxFilter[3]= vLumFilter[2*dstY   ]*0x10001;
                    chrMmxFilter[2]=
                    chrMmxFilter[3]= vChrFilter[2*chrDstY]*0x10001;
                    RENAME(yuv2packed2)(c, *lumSrcPtr, *(lumSrcPtr+1), *chrSrcPtr, *(chrSrcPtr+1),
                        dest, dstW, lumAlpha, chrAlpha, dstY);
                }
                else //General RGB
                {
                    RENAME(yuv2packedX)(c,
                        vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                        vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                        dest, dstW, dstY);
                }
            }
        }
        else // hmm looks like we can't use MMX here without overwriting this array's tail
        {
            int16_t **lumSrcPtr= lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            int16_t **chrSrcPtr= chrPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21){
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if (dstY&chrSkipMask) uDest= NULL; //FIXME split functions in lumi / chromi
                yuv2nv12XinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, dstW, chrDstW, dstFormat);
            }
            else if (isPlanarYUV(dstFormat) || isGray(dstFormat)) //YV12
            {
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat)) uDest=vDest= NULL; //FIXME split functions in lumi / chromi
                yuv2yuvXinC(
                    vLumFilter+dstY*vLumFilterSize   , lumSrcPtr, vLumFilterSize,
                    vChrFilter+chrDstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, uDest, vDest, dstW, chrDstW);
            }
            else
            {
                ASSERT(lumSrcPtr + vLumFilterSize - 1 < lumPixBuf + vLumBufSize*2);
                ASSERT(chrSrcPtr + vChrFilterSize - 1 < chrPixBuf + vChrBufSize*2);
                yuv2packedXinC(c,
                    vLumFilter+dstY*vLumFilterSize, lumSrcPtr, vLumFilterSize,
                    vChrFilter+dstY*vChrFilterSize, chrSrcPtr, vChrFilterSize,
                    dest, dstW, dstY);
            }
        }
    }

#ifdef HAVE_MMX
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}